#!/bin/bash
#===============================================================================
# keeps a list of URLs with a description
# and makes their contents searchable with an index
#
# uses curl
#===============================================================================
# configuration
# directory holding the link list and the word index
data="${HOME}/.lix"
# value handed to curl -m when downloading a page
timeout=120
# words that are never put into the index, one per line
# (decompose drops every word that equals one of these lines)
stopwords="
http
www
com
org
de
nbsp
not
and
or
of
to
in
on
by
at
as
from
for
do
is
be
are
can
if
an
it
so
no
we
the
this
that
with
html"
# constants
tab=$'\t'
linkfile="${data}/links.txt"
indexfile="${data}/index.txt"
# make sure the data directory and both data files exist
mkdir -p "$data"
touch "$linkfile" "$indexfile"
#-------------------------------------------------------------------------------
## helper
# print a message for the user on stderr
# arguments: the words of the message (joined with single blanks)
function status() {
  # join with a blank no matter what the caller did to IFS
  local IFS=' '
  # printf instead of echo: a message starting with -n or -e must not be
  # taken for an echo option
  printf '%s\n' "$*" >&2
}
# awk with TAB as input and output field separator
# arguments: handed to awk unchanged
function tawk() {
  local sep=$'\t'
  awk -v "FS=$sep" -v "OFS=$sep" "$@"
}
# fetch the contents of an url and print them on stdout
# arguments: $1 - url to download
# globals:   timeout - value for curl -m (120 is used when unset or empty)
# returns:   the exit status of curl
function fetch() {
  local url="$1"
  # -f is left out on purpose (original note: "-f fails silently")
  # the timeout is quoted and defaulted so that an unset value cannot make
  # curl take -s as the argument of -m
  curl -m "${timeout:-120}" -s "$url"
}
# extract the indexable words from the text on stdin
# output:  one lowercased word per line, sorted, without duplicates
# globals: stopwords - newline separated list of words that are left out
function decompose() {
  # no linefeeds
  tr '\n' ' ' | tr '\r' ' ' |
    # no tags
    sed 's/<[^>]*>/ /g' |
    # only alphanum, umlauts and underscore allowed
    sed 's/[^a-zA-Z0-9äöüßÄÖÜ_]/ /g' |
    # lowercase everything
    tr '[:upper:]' '[:lower:]' |
    # one word per line; -s squeezes runs of blanks into one line break
    # (the former sed 's/ */ /g' matched the empty string and put a blank
    # between all characters, so no word survived)
    tr -s ' ' '\n' |
    # minimum 2 characters (also drops a leftover empty line)
    grep '..' |
    # no stopwords
    grep -vxF -e "$stopwords" |
    # no duplicates
    sort -u
}
# get the title from a html document
# arguments: $1 - the complete text of the document
# output:    the text between the first <title> tag and the next </title>
#            tag on one line, blanks squeezed and trimmed; nothing when
#            there is no such pair
# NOTE(review): the tag literals were missing from this function (the awk
# program was not even valid); <title> and </title> are assumed - confirm
title() {
  local fulltext="$1"
  # LC_ALL=C: work on bytes so that pages in any encoding pass through
  printf '%s\n' "$fulltext" |
    LC_ALL=C awk '
      # glue all lines together, a line break counts as a blank
      { doc = doc " " $0 }
      END {
        # tags are looked up case insensitive
        start = index(tolower(doc), "<title>")
        if (!start) exit
        doc = substr(doc, start + 7)
        stop = index(tolower(doc), "</title>")
        if (!stop) exit
        doc = substr(doc, 1, stop - 1)
        # a tab inside the title would break the tab separated link list
        gsub(/[ \t\r]+/, " ", doc)
        sub(/^ /, "", doc)
        sub(/ $/, "", doc)
        print doc
      }
    '
}
#-------------------------------------------------------------------------------
## list stuff
# append an entry to the link list
# arguments: $1 - url
#            $2 - description
#            $3 - title
#            $4 - fulltext (accepted for symmetry with index_add, not stored)
#            $5 - date
# globals:   linkfile - file the record "url TAB title TAB description TAB
#            date" is appended to
function list_add() {
  local url="$1"
  local description="$2"
  local title="$3"
  local date="$5"
  # tabs and line breaks inside a field would split the record
  local breaks=$'\t\r\n'
  title="${title//[$breaks]/ }"
  description="${description//[$breaks]/ }"
  printf '%s\t%s\t%s\t%s\n' "$url" "$title" "$description" "$date" >>"$linkfile"
}
# get entries from the link list
# arguments: urls to look up; without arguments the urls are read from
#            stdin, one per line
# output:    the matching records of the link list, in the order of the
#            requested urls
# globals:   linkfile
function list_get() {
  if [ $# != 0 ]; then
    # one url per line (echo "$@" would join them into a single line that
    # is then looked up as one url)
    printf '%s\n' "$@" | list_get
    return $?
  fi
  local url
  while read -r url; do
    # ($1 "") forces a string comparison
    tawk -v url="$url" '($1 "") == url { print }' <"$linkfile"
  done
}
# remove urls from the link list
# arguments: urls to remove; without arguments the urls are read from
#            stdin, one per line
# globals:   linkfile (rewritten by way of "$linkfile.tmp")
function list_remove() {
  if [ $# != 0 ]; then
    # callers pass the url as an argument; feed it to the stdin loop
    printf '%s\n' "$@" | list_remove
    return $?
  fi
  local url
  while read -r url; do
    # replace the list only when the filtered copy was written completely
    tawk -v url="$url" '($1 "") != url { print }' <"$linkfile" >"$linkfile.tmp" &&
      mv "$linkfile.tmp" "$linkfile"
  done
}
# succeeds when the url has an entry in the link list
# arguments: $1 - url to look for
function list_contains() {
  local wanted="$1"
  local found
  found="$(list_get "$wanted")"
  [ -n "$found" ]
}
#-------------------------------------------------------------------------------
## index stuff
# add an entry to the index
# arguments: $1 - url
#            $2 - description
#            $3 - title
#            $4 - fulltext
#            $5 - date (accepted for symmetry with list_add, not used)
# globals:   indexfile - gets one line "word TAB url" for every word that
#            decompose finds in url, title, description and fulltext
function index_add() {
  local url="$1"
  local description="$2"
  local title="$3"
  local fulltext="$4"
  local word
  # printf instead of echo: a text like -n must not vanish as an option
  printf '%s\n' "$url" "$title" "$description" "$fulltext" |
    decompose |
    while read -r word; do
      printf '%s\t%s\n' "$word" "$url"
    done >>"$indexfile" # open the index once, not once per word
}
# remove urls from the index
# arguments: urls to remove; without arguments the urls are read from
#            stdin, one per line
# globals:   indexfile (rewritten by way of "$indexfile.tmp")
function index_remove() {
  if [ $# != 0 ]; then
    # callers pass the url as an argument; feed it to the stdin loop
    printf '%s\n' "$@" | index_remove
    return $?
  fi
  local url
  while read -r url; do
    # replace the index only when the filtered copy was written completely
    tawk -v url="$url" '($2 "") != url { print }' <"$indexfile" >"$indexfile.tmp" &&
      mv "$indexfile.tmp" "$indexfile"
  done
}
# create a histogram of all words in the index
# output:  one line "count TAB word" per word, lowest count first
# globals: indexfile
function index_histogramm() {
  cut -f 1 <"$indexfile" |
    sort |
    uniq -c |
    sort -n |
    # turn the padded "count word" of uniq into "count TAB word"; done in
    # awk because \t in a sed replacement only works with GNU sed
    awk '{ count = $1; sub(/^ *[0-9]+ */, ""); print count "\t" $0 }'
}
# find urls in the index
# arguments: search words; a word with a leading - is a negative word
# output:    the urls that were found for every positive word and for none
#            of the negative words, sorted, one per line
# A word without any hit is reported with status and left out of the
# search altogether.
function index_search() {
  local word short urls
  local p_count=0
  local p_urls="" n_urls=""
  # the hits are collected in variables; fixed scratch files in the data
  # directory would collide when two searches run at the same time
  for word; do
    short="${word#-}"
    if ! urls=$(index_search_word "$short"); then
      status "excluded $short"
      continue
    fi
    if [ "$word" = "$short" ]; then
      p_urls+="$urls"$'\n'
      p_count=$((p_count + 1))
    else
      n_urls+="$urls"$'\n'
    fi
  done
  # get urls found for all +words and not found for any -word
  comm -2 -3 <(
    # all urls occurring exactly p_count times in the positive hits
    printf '%s' "$p_urls" |
      sort |
      uniq -c |
      awk -v n="$p_count" '$1 == n { sub(/^ *[0-9]+ /, ""); print }'
  ) <(
    # all urls occurring at least once in the negative hits
    printf '%s' "$n_urls" |
      sort -u
  )
}
# look up a single word in the index
# arguments: $1 - the word; it is lowercased and matches every index word
#            that contains it
# output:    the urls of the matching index lines, sorted, no duplicates
# returns:   success when at least one url was found
function index_search_word() {
  local needle
  local hits
  needle=$(echo "$1" | tr '[:upper:]' '[:lower:]')
  hits=$(
    tawk -v word="$needle" 'index($1, word) { print $2 }' <"$indexfile" |
      sort -u
  )
  echo "$hits"
  if [ -n "$hits" ]; then
    return 0
  fi
  return 1
}
#-------------------------------------------------------------------------------
## public functions
# add links to the list and the index
# usage:     add [-d description] [url...]
# arguments: -d description - text stored with every url of this call
#            url...         - urls to index; without any url they are read
#                             from stdin, one per line
# Urls that are already in the list and urls that cannot be downloaded are
# reported with status and skipped.
function add() {
  local description=""
  local url fulltext page_title now
  if [ "$1" = "-d" ]; then
    description="$2"
    shift 2
  fi
  if [ $# -eq 0 ]; then
    while read -r url; do
      # an empty line is not an url
      [ -n "$url" ] || continue
      # keep the nested call away from the list of urls on stdin
      add -d "$description" "$url" </dev/null
    done
    return 0
  fi
  for url; do
    ### slow: scans the whole list for every url
    if list_contains "$url"; then
      status "skipping already indexed $url"
      continue
    fi
    status "indexing $url"
    # assignment apart from the declaration: "local x=$(cmd)" returns the
    # status of local and would hide a failed download
    if ! fulltext=$(fetch "$url"); then
      status "cannot download $url"
      continue
    fi
    page_title=$(title "$fulltext")
    now=$(date +%s)
    list_add "$url" "$description" "$page_title" "$fulltext" "$now"
    index_add "$url" "$description" "$page_title" "$fulltext" "$now"
  done
}
# remove links from the index and the list
# arguments: urls to remove; without arguments the urls are read from
#            stdin, one per line
function remove() {
  local url
  if [ $# -eq 0 ]; then
    while read -r url; do
      # an empty line is not an url
      [ -n "$url" ] || continue
      # remove the url (this branch used to call add by mistake)
      remove "$url"
    done
    return 0
  fi
  for url; do
    status "deindexing $url"
    # both helpers take their urls from stdin
    printf '%s\n' "$url" | index_remove
    printf '%s\n' "$url" | list_remove
  done
}
# search the link list by words from the index
# arguments: one or more search words, handed to index_search
# output:    the list entries of all urls that index_search reports
# returns:   1 (after a message via status) when called without words
function search() {
  if [ $# -eq 0 ]; then
    status "error: wrong number of arguments"
    return 1
  fi
  index_search "$@" |
    list_get
}
# index all firefox bookmarks
function mozbm() {
### hardcoded - mozilla
#cat "$HOME"/.mozilla/default/*/bookmarks.html |
### hardcoded - firefox
cat "$HOME"/.mozilla/firefox/*.default/bookmarks.html |
sed -n 's/.*&2 <