#!/bin/bash
#===============================================================================
# compare hits for words with google
#
# usage: <script> word ..
#
# Prints one "<hits><TAB><word>" line per word, sorted numerically by hits,
# and, when more than one word was given, a final "winner<TAB><word>" line.
#
# uses curl, iconv, od, awk, sed, grep, cut
#===============================================================================

# config
user_agent="Mozilla/4.0 (compatible; MSIE 5.5; Windows 98; Win 9x 4.90)"
use_proxy="no"

# proxy config (only read when use_proxy is y/Y/yes/Yes/YES)
#proxy_host="proxy.test.org"
#proxy_port="8080"
#proxy_user="me"
#proxy_passwd="xxxxxxxx"

# constants
readonly TAB=$'\t'

#-------------------------------------------------------------------------------
# URL encode a stream or a string.
#
# Arguments: $1 - optional string to encode; without it stdin is encoded
# Outputs:   the encoded text on stdout, without a trailing newline
# Returns:   1 (with a usage message on stderr) if more than one argument
#            is given, otherwise the status of the encoding pipeline
#
# Encoding rules:
#   - a space becomes "+"
#   - a-z A-Z 0-9 . * ( ) / - are kept unchanged ('/' is NOT encoded!)
#   - every other byte becomes "%" followed by its two-digit lowercase hex
#-------------------------------------------------------------------------------
function url_encode() {
    # usage exit for too many parameters
    if [[ $# -gt 1 ]]; then
        echo >&2 "usage: url_encode [string]"
        return 1
    fi

    # self call when an argument is given, else handle stdin
    # (printf instead of "echo -n" so that strings like "-n" survive)
    if [[ $# -eq 1 ]]; then
        printf '%s' "$1" | url_encode
        return $?
    fi

    # od dumps every input byte as a two-digit hex value (-v: never collapse
    # repeated lines, -An: no offsets); awk then maps each hex value either
    # back to its character or to the %xx escape.
    od -An -v -tx1 |
    LC_ALL=C awk '
        BEGIN {
            safe = "abcdefghijklmnopqrstuvwxyz" \
                   "ABCDEFGHIJKLMNOPQRSTUVWXYZ" \
                   "0123456789.*()/-"
            # lookup table: hex code of every unchanged character
            for (i = 33; i < 127; i++) {
                c = sprintf("%c", i)
                if (index(safe, c) > 0)
                    keep[sprintf("%02x", i)] = c
            }
        }
        {
            for (f = 1; f <= NF; f++) {
                if ($f == "20")
                    printf("+")
                else if ($f in keep)
                    printf("%s", keep[$f])
                else
                    printf("%%%s", $f)
            }
        }
    '
}

#-------------------------------------------------------------------------------
# Get the number of hits for a word.
#
# Globals:   user_agent, use_proxy (read); proxy_host, proxy_port, proxy_user,
#            proxy_passwd (read only when the proxy is enabled)
# Arguments: $* - the search word(s), joined into one query string
# Outputs:   the hit count with the thousands separators removed, or nothing
#            when the downloaded page has no "of about <number>" text
#
# NOTE(review): the extraction depends on the result page containing the text
# "of about <number>" - confirm that this still matches what the server sends.
# NOTE(review): with the proxy enabled the proxy password is passed on the
# curl command line, where other local users may be able to see it.
#-------------------------------------------------------------------------------
function word_count() {
    local query="$*"

    # build query url
    local system_charset=""     # "" is the local encoding
    local web_charset="utf-8"   # google uses utf-8
    local encoded
    encoded=$(
        printf '%s' "$query" |
        iconv -f "$system_charset" -t "$web_charset" |
        url_encode
    )
    local url="http://www.google.com/search?num=1&safe=off&hl=en&&q=$encoded"

    # assemble the download command; proxy options only when requested
    local -a curl_args=(-s -A "$user_agent")
    case "$use_proxy" in
        y|Y|yes|Yes|YES)
            curl_args+=(-x "$proxy_host:$proxy_port")
            curl_args+=(-U "$proxy_user:$proxy_passwd")
            ;;
    esac

    # download page and filter out the number of occurrences:
    # keep only the digits/commas following "of about", first match only
    curl "${curl_args[@]}" "$url" |
    sed -n 's!.*of about \([0-9,][0-9,]*\).*!\1!p' |
    head -n 1 |
    tr -d ','

    # german variant, kept for reference:
    #local url="http://www.google.de/search?num=1&q=$word"
    #sed -n 's!.*von ungef..hr \(\([0-9]\|\.\)\+\).*!\1!;T;p' | head -n 1 | tr -d '.'
}

#-------------------------------------------------------------------------------
# Output hits for the words in $@, sorted by number of hits (ascending).
#
# Arguments: $@ - the words to look up
# Outputs:   one "<hits><TAB><word>" line per word
#-------------------------------------------------------------------------------
function query_words() {
    local word count
    for word; do
        count=$(word_count "$word")
        printf '%s\n' "$count$TAB$word"
    done |
    sort -n
}

#-------------------------------------------------------------------------------
# Find out the most successful word(s) in a sorted hit list.
#
# Arguments: $1 - output of query_words (sorted ascending, so that the last
#                 line carries the highest count)
# Outputs:   "winner<TAB><word>"; on a tie every further word with the same
#            count follows on its own line
#-------------------------------------------------------------------------------
function find_winners() {
    local hits="$1"
    local highest winner

    # the count is everything before the first tab of the last line
    highest=$(printf '%s\n' "$hits" | tail -n 1)
    highest=${highest%%"$TAB"*}

    # all words sharing that count; the trailing tab in the pattern keeps
    # "1" from matching "10"
    winner=$(printf '%s\n' "$hits" | grep "^${highest}${TAB}" | cut -f2-)

    printf '%s\n' "winner$TAB$winner"
}

#-------------------------------------------------------------------------------
# Entry point.
#
# Arguments: $@ - the words given on the command line
# Returns:   1 after printing the usage message when no word is given,
#            otherwise the status of the last output step
#-------------------------------------------------------------------------------
function main() {
    # no word given: usage exit
    if [[ $# -eq 0 ]]; then
        echo >&2 "usage: ${0##*/} word .."
        return 1
    fi

    # get hits
    local result
    result=$(query_words "$@")
    printf '%s\n' "$result"

    # print winner (pointless for a single word)
    if [[ $# -gt 1 ]]; then
        find_winners "$result"
    fi
}

# main is the last command, so its status becomes the exit status
main "$@"

#===============================================================================
#:mode=shellscript:noTabs=false:tabSize=4:indentSize=4:lineSeparator=\n: