#!/bin/bash
#===============================================================================
# translate a word between english and german with dict.leo.org
#
# uses lynx, tidy, xmlstarlet and iconv
#===============================================================================

#-------------------------------------------------------------------------------
## xterm colors

# color => 4x would change the background
BLACK=$'\e[30m'
RED=$'\e[31m'
GREEN=$'\e[32m'
ORANGE=$'\e[33m'
BLUE=$'\e[34m'
MAGENTA=$'\e[35m'
CYAN=$'\e[36m'
GREY=$'\e[37m'

# style
RESET=$'\e[0m'
NONE=$'\e[00m'
BOLD=$'\e[01m'
UNDERLINE=$'\e[04m'
BLINK=$'\e[05m'
INVERSE=$'\e[07m'
CONCEALED=$'\e[08m'

#-------------------------------------------------------------------------------
# URL encode a stream or a string.
#
# Usage:     url_encode [string]
# Arguments: $1 - optional string to encode; without it stdin is encoded
# Outputs:   the encoded text on stdout (no trailing newline)
# Returns:   1 (with a usage message on stderr) for more than one argument,
#            otherwise the status of the encoding pipeline
#
# Encoding rules:
#   - a space becomes "+"
#   - a-z A-Z 0-9 . * ( ) / - are passed through unchanged
#     (NOTE: "/" is deliberately NOT encoded)
#   - every other byte becomes "%" followed by its two-digit lowercase hex value
function url_encode() {
  # usage exit for too many parameters
  if [ $# -gt 1 ]; then
    echo >&2 "usage: url_encode [string]"
    return 1
  fi

  # self call when an argument is given, else handle stdin.
  # printf is used instead of "echo -n" so that arguments which look like
  # echo options (-n, -e, -E) are encoded instead of being swallowed.
  if [ $# -eq 1 ]; then
    printf '%s' "$1" | url_encode
    return $?
  fi

  # od emits every input byte as a two-digit hex value (-v: never collapse
  # repeated lines, -An: no offset column). awk then decides per byte, using
  # a lookup table keyed by hex value, so no locale-dependent character
  # ranges are involved. LC_ALL (not LANG) is set because LC_ALL has the
  # highest precedence among the locale variables.
  od -An -v -tx1 |
    LC_ALL=C awk '
      BEGIN {
        safe = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789.*()/-"
        # map the hex value of every pass-through character to the character
        for (code = 33; code < 127; code++) {
          ch = sprintf("%c", code)
          if (index(safe, ch) > 0) {
            keep[sprintf("%02x", code)] = ch
          }
        }
      }
      {
        for (i = 1; i <= NF; i++) {
          if ($i == "20") {
            printf("+")
          } else if ($i in keep) {
            printf("%s", keep[$i])
          } else {
            printf("%%%s", $i)
          }
        }
      }
    '
}

#-------------------------------------------------------------------------------
# get the leo page for a query and output it as valid utf-8 xhtml
#
# Arguments: all arguments are joined into one query string
# Globals:   searchLoc, lp, spellToler, deStem, cmpType (read; appended to the
#            request URL as parameters of the same name)
# Outputs:   the fetched page, converted to utf-8 XHTML, on stdout
# Returns:   the status of the final tidy stage, or non-zero early if the
#            query could not be encoded
fetch() {
  # get argument(s)
  local query="$*"

  # the charset coming from the console
  # NOTE(review): empty here; presumably this makes iconv fall back to the
  # charset of the current locale - confirm for the iconv in use
  local system_charset=""

  # the charset leo uses
  local web_charset="iso-8859-15"

  # url-encode the query string; declared separately from the assignment so
  # that "local" does not mask the status of the command substitution
  local encoded
  encoded=$(printf '%s' "$query" |
    iconv -f "$system_charset" -t "$web_charset" |
    url_encode) || return

  # additional args from global variables
  local args="?search=$encoded&searchLoc=$searchLoc&lp=$lp&spellToler=$spellToler&deStem=$deStem&cmpType=$cmpType"

  # make the search URL
  local url="http://dict.leo.org/$args"

  # get the HTML page from leo,
  # convert it from leo's charset to utf-8,
  # then convert HTML 4.0 transitional to XHTML
  # (tidy's diagnostics on stderr are intentionally discarded)
  lynx -source "$url" |
    iconv -f "$web_charset" -t "utf-8" |
    tidy -asxhtml -utf8 -indent -wrap 1024 2>/dev/null
}

#-------------------------------------------------------------------------------
# NOTE(review): the extract() definition starts here but continues past this
# part of the file. Its opening fragment is kept verbatim on the next line,
# exactly as the inert comment text it already was on the collapsed source line.
# extract the content table from the leo xhtml extract() { local xmlstarlet="xmlstarlet" which >/dev/null "$xmlstarlet" || xmlstarlet="xml" which >/dev/null "$xmlstarlet" || { echo >&2 "xmlstarlet not installed"; return 1; } ### HACK: used as an escape for near-tag spaces local esc="\\§" # prepare xml code for xmlstarlet and filter out form containing translations local table=$( ### HACK: remove xml namespace, otherwise xmlstarlet fails sed 's###' | ### HACK: exchange entities causing parser errors with their numerical equivalent sed 's/ /\ /g' | sed 's/–/\–/g' | # retrieve the form containing translations $xmlstarlet sel -t -c "//form[@id='WORDS']/table" ) # abort without results [ -z "$table" ] && { echo "
no search results" return; } # begin html page echo "" # mangle table echo "$table" | ### HACK: escape spaces near tags as $esc to prevent ed -d from removing them sed 's/ '"$esc"' />'"$esc"'/g' | # remove unwanted tags $xmlstarlet ed -d "//img" | # remove first table row conatining language names $xmlstarlet ed -d "//tr[position()=1]" | # remove all attributes from tables and rows $xmlstarlet ed -d "//table/@*" | $xmlstarlet ed -d "//tr/@*" | # remove all column attributes except colspan $xmlstarlet ed -d "//td/@width" | $xmlstarlet ed -d "//td/@align" | $xmlstarlet ed -d "//td/@valign" | $xmlstarlet ed -d "//td/@nowrap" | $xmlstarlet ed -d "//td/@class" | # replace link tags by their content $xmlstarlet ed -d "//a/@*" | sed 's#\|\|##g' | # remove nbsp at the start of td colspan=5 sed 's/