#!/bin/bash
#===============================================================================
# keeps a list of URLs with a description
# and makes their contents searchable with an index
#
# uses curl
#
# Data layout (both files are TAB separated, one record per line):
#   $linkfile   url <TAB> title <TAB> description <TAB> date (epoch seconds)
#   $indexfile  word <TAB> url
#===============================================================================

# config
data="$HOME/.lix"
timeout=120
# whitespace separated list of words that are never indexed
stopwords="
http www com org de nbsp
not and or of to in on by at as from for do is be are can if an it so no we
the this that with html
"

# constants
tab=$'\t'
linkfile="$data/links.txt"
indexfile="$data/index.txt"

# init data directories
mkdir -p "$data"
touch "$linkfile"
touch "$indexfile"

#-------------------------------------------------------------------------------
## helper

# print out user info (goes to stderr so stdout stays clean for piping)
function status() {
  echo >&2 "$@"
}

# awk using TAB as input and output field separator
function tawk() {
  awk -v FS=$'\t' -v OFS=$'\t' "$@"
}

# fetch contents of an url
#   $1 - url
# prints the body to stdout; the exit status is the one of curl
function fetch() {
  local url="$1"
  curl -m "$timeout" -s "$url" # -f fails silently
}

# extracts words from a string
# reads text from stdin and prints the distinct words, sorted, one per line
function decompose() {
  # no linefeeds
  tr '\r\n' '  ' |
    # no tags
    sed 's/<[^>]*>/ /g' |
    # only alphanum and underscore allowed
    sed 's/[^a-zA-Z0-9äöüßÄÖÜ_]/ /g' |
    # lowercase everything
    tr '[:upper:]' '[:lower:]' |
    # one word per line; -s also swallows runs of blanks
    tr -s ' ' '\n' |
    # minimum 2 characters
    grep '..' |
    # no stopwords; grep -F wants one pattern per line, so the
    # whitespace separated list is normalized first
    grep -vxF -f <(tr -s '[:space:]' '\n' <<<"$stopwords") |
    # no duplicates
    sort -u
}

# get the title from a html file
#   $1 - full text of the document
# prints the text between <title> and </title> with whitespace collapsed,
# or nothing when the document has no title element
# NOTE(review): the tag literals were empty in the source this was restored
# from (index($0, "") never matches); <title>/</title> is the presumed intent.
function title() {
  local fulltext="$1"
  printf '%s\n' "$fulltext" |
    # keep only the lines from the opening to the closing tag
    awk '
      { lower = tolower($0) }
      index(lower, "<title>")  { out = 1 }
      out                      { print }
      index(lower, "</title>") { out = 0 }
    ' |
    tr '\r\n' '  ' |
    sed 's#.*<[Tt][Ii][Tt][Ll][Ee]>\(.*\)</[Tt][Ii][Tt][Ll][Ee]>.*#\1#' |
    # no unnecessary whitespace
    tr -s ' ' |
    sed 's/^ //;s/ $//'
}

#-------------------------------------------------------------------------------
## list stuff

# add entry to the list
#   $1 url, $2 description, $3 title, $4 fulltext (not stored), $5 date
function list_add() {
  local url="$1"
  local description="$2"
  local title="$3"
  local fulltext="$4"
  local date="$5"
  printf '%s\t%s\t%s\t%s\n' "$url" "$title" "$description" "$date" >>"$linkfile"
}

# get entries from the list
# urls are taken from the arguments or, without arguments, from stdin
# (one per line); prints the matching records of the link file
function list_get() {
  local url
  if [ $# -ne 0 ]; then
    # one url per line - echo "$@" would glue them into a single line
    printf '%s\n' "$@" | list_get
    return $?
  fi
  ### files would have to be sorted
  #join -t "$tab" "$linkfile" -
  while read -r url; do
    tawk -v url="$url" '$1 == url { print }' <"$linkfile"
  done
}

# remove urls from the list
# urls are taken from the arguments or, without arguments, from stdin
function list_remove() {
  local url
  if [ $# -ne 0 ]; then
    printf '%s\n' "$@" | list_remove
    return $?
  fi
  ### files would have to be sorted
  #join -t "$tab" -v 1 "$linkfile" - >"$linkfile.tmp"
  #mv "$linkfile.tmp" "$linkfile"
  while read -r url; do
    tawk -v url="$url" '$1 != url { print }' <"$linkfile" >"$linkfile.tmp"
    mv "$linkfile.tmp" "$linkfile"
  done
}

# returns true when the url exists in the list
function list_contains() {
  local url="$1"
  [ -n "$(list_get "$url")" ]
}

#-------------------------------------------------------------------------------
## index stuff

# add entry to the index
#   $1 url, $2 description, $3 title, $4 fulltext, $5 date (unused)
# every distinct word of url, title, description and fulltext is stored
function index_add() {
  local url="$1"
  local description="$2"
  local title="$3"
  local fulltext="$4"
  local date="$5"
  local word
  printf '%s\n' "$url" "$title" "$description" "$fulltext" |
    decompose |
    while read -r word; do
      printf '%s\t%s\n' "$word" "$url"
    done >>"$indexfile"
}

# remove urls from the index
# urls are taken from the arguments or, without arguments, from stdin
function index_remove() {
  local url
  if [ $# -ne 0 ]; then
    printf '%s\n' "$@" | index_remove
    return $?
  fi
  ### files would have to be sorted
  #join -t "$tab" -v 1 -1 2 -2 1 "$indexfile" - >"$indexfile.tmp"
  #mv "$indexfile.tmp" "$indexfile"
  while read -r url; do
    tawk -v url="$url" '$2 != url { print }' <"$indexfile" >"$indexfile.tmp"
    mv "$indexfile.tmp" "$indexfile"
  done
}

# create a histogram of all words in the index
# prints "count<TAB>word", least frequent first
function index_histogramm() {
  tawk '{print $1}' <"$indexfile" |
    sort |
    uniq -c |
    sort -n |
    sed "s/^ *\([0-9]*\) */\1$tab/"
}

# find urls in the index
# arguments are words; a leading "-" marks a word that must NOT occur.
# prints the urls matching all positive and none of the negative words.
# words that are not in the index at all are reported and ignored.
function index_search() {
  local word short
  local p_count=0

  # urls of all positive words, p_count = number of words that had hits
  for word; do
    short="${word#-}"
    [ "$word" == "$short" ] || continue
    index_search_word "$short" || { status "excluded $short"; continue; }
    ((p_count++))
  done >"$data/p_list"

  # urls of all negative words
  for word; do
    short="${word#-}"
    [ "$word" != "$short" ] || continue
    index_search_word "$short" || { status "excluded $short"; continue; }
  done >"$data/n_list"

  # get urls found for all +words and not found for any -word
  comm -2 -3 <(
    # all urls occurring exactly p_count times in p_list
    sort "$data/p_list" |
      uniq -c |
      grep "^ *$p_count " |
      sed 's/^ *[0-9]* //'
  ) <(
    # all urls occurring more than 0 times in n_list
    sort -u "$data/n_list"
  )

  # cleanup
  # rm "$data/n_list" "$data/p_list"
}

# find a single word in the index (output is sorted)
#   $1 - word, matched case-insensitively as a substring of indexed words
# returns false (and prints nothing) when no url was found
function index_search_word() {
  local word urls
  word=$(printf '%s\n' "$1" | tr '[:upper:]' '[:lower:]')
  urls=$(
    tawk -v word="$word" 'index($1, word) { print $2 }' <"$indexfile" |
      sort -u
  )
  [ -n "$urls" ] || return 1
  printf '%s\n' "$urls"
}

#-------------------------------------------------------------------------------
## public functions

# add a link to the index
#   usage: add [-d description] [url...]
# without urls they are read from stdin, one per line
function add() {
  local description=""
  local url fulltext title date

  if [ "$1" = "-d" ]; then
    description="$2"
    shift 2
  fi

  if [ $# -eq 0 ]; then
    while read -r url; do
      add -d "$description" "$url"
    done
    return 0
  fi

  for url; do
    ### BÄH - slow!
    if list_contains "$url"; then
      status "skipping already indexed $url"
      continue
    fi
    status "indexing $url"
    # assignment is kept apart from "local", which would hide a failed fetch
    fulltext=$(fetch "$url") || { status "cannot download $url"; continue; }
    title=$(title "$fulltext")
    date=$(date +%s)
    list_add "$url" "$description" "$title" "$fulltext" "$date"
    index_add "$url" "$description" "$title" "$fulltext" "$date"
  done
}

# remove a link from the index
#   usage: remove [url...]
# without urls they are read from stdin, one per line
function remove() {
  local url

  if [ $# -eq 0 ]; then
    while read -r url; do
      remove "$url"
    done
    return 0
  fi

  for url; do
    status "deindexing $url"
    index_remove "$url"
    list_remove "$url"
  done
}

# search links by words from the index
#   usage: search word... (prefix a word with "-" to exclude it)
# prints the matching records of the link file
function search() {
  [ $# -eq 0 ] && { status "error: wrong number of arguments"; return 1; }
  index_search "$@" |
    list_get #|
  #tawk '{print $1 "\n" $3 "\n"}'
}

# index all firefox bookmarks
# NOTE(review): the definition of mozbm() is cut off in the available source
# (it stops in the middle of a sed expression). The visible part is kept
# below as a comment and must be completed from the original file.
#function mozbm() {
#  ### hardcoded - mozilla
#  #cat "$HOME"/.mozilla/default/*/bookmarks.html |
#  ### hardcoded - firefox
#  cat "$HOME"/.mozilla/firefox/*.default/bookmarks.html |
#  sed -n 's/.*&2 <