#!/bin/bash
#===============================================================================
# compare hits for words with google
#
# uses curl and iconv
#===============================================================================
# ---- configuration ----
# User-Agent header sent with every request.
user_agent='Mozilla/4.0 (compatible; MSIE 5.5; Windows 98; Win 9x 4.90)'
# Set to "yes" (or "y") to send the requests through the proxy configured below.
use_proxy='no'
# Proxy settings; uncomment and fill in when use_proxy is enabled.
#proxy_host="proxy.test.org"
#proxy_port="8080"
#proxy_user="me"
#proxy_passwd="xxxxxxxx"
# ---- constants ----
# Literal tab character, used as the field separator of the output lines.
printf -v TAB '\t'
#-------------------------------------------------------------------------------
### stolen from the web include
# URL-encode a string argument or, when called without arguments, stdin.
# Arguments: $1 - (optional) string to encode; stdin is encoded if omitted
# Outputs:   the encoded data on stdout, without a trailing newline
# Returns:   1 (with a usage message on stderr) if more than one argument
#            is given, otherwise the status of the encoding pipeline
# Encoding rules, applied byte by byte:
#   - a space becomes "+"
#   - letters, digits and . * ( ) - / are passed through unchanged
#     (note: "/" is NOT encoded)
#   - every other byte becomes "%" plus its two-digit lowercase hex code
function url_encode() {
    # usage exit for too many parameters
    [ $# -gt 1 ] && { echo >&2 "usage: url_encode [string]"; return 1; }
    # self call when an argument is given, else handle stdin;
    # printf instead of echo so that arguments such as "-n" or "-e" are
    # encoded instead of being swallowed as echo options
    [ $# -eq 1 ] && { printf '%s' "$1" | url_encode; return $?; }
    # od dumps every input byte as a two-digit hex code; awk then maps each
    # code to its output form. LC_ALL=C (rather than LANG=C, which LC_ALL
    # overrides) keeps the processing strictly byte-wise.
    LC_ALL=C od -An -v -t x1 |
    LC_ALL=C awk '
        BEGIN {
            # keep[hex] = character, for every byte that is passed through
            safe = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789.*()/-"
            for (i = 33; i < 127; i++) {
                c = sprintf("%c", i)
                if (index(safe, c) > 0) keep[sprintf("%02x", i)] = c
            }
        }
        {
            for (f = 1; f <= NF; f++) {
                # force string context so that codes like "07" stay as written
                hex = $f ""
                if (hex == "20") printf("+")
                else if (hex in keep) printf("%s", keep[hex])
                else printf("%%%s", hex)
            }
        }
    '
}
#-------------------------------------------------------------------------------
# Print the number of hits Google reports for a search term.
# Globals:   user_agent (read), use_proxy (read); when use_proxy is enabled
#            also proxy_host, proxy_port, proxy_user, proxy_passwd (read)
# Arguments: $@ - the search term (all arguments are joined into one string)
# Outputs:   the hit count as plain digits on stdout; nothing if the page
#            contains no "of about <number>" phrase
function word_count() {
    local term="$*"
    local system_charset=""    # "" is the local encoding
    local web_charset="utf-8"  # google uses utf-8
    # build query url: convert the term to the web charset, then URL-encode it
    # (printf instead of echo so that a term like "-n" is not lost)
    local query
    query=$(
        printf '%s' "$term" |
        iconv -f "$system_charset" -t "$web_charset" |
        url_encode
    )
    local url="http://www.google.com/search?num=1&safe=off&hl=en&q=$query"
    # assemble the curl options; the proxy options are added only on request
    local -a curl_opts=(-s -A "$user_agent")
    case "$use_proxy" in
        y|Y|yes|Yes|YES)
            # NOTE(review): the proxy password is passed on the command line
            # and is therefore visible in the process list
            curl_opts+=(-x "$proxy_host:$proxy_port" -U "$proxy_user:$proxy_passwd")
            ;;
    esac
    # download page and filter out the number of occurrences: keep only lines
    # containing "of about <number>", take the first one, drop the commas
    curl "${curl_opts[@]}" "$url" |
    sed -n 's!.*of about \([0-9,][0-9,]*\).*!\1!p' |
    head -n 1 | tr -d ','
    # variant for the German site (kept for reference):
    #local url="http://www.google.de/search?num=1&q=$word"
    #sed -n 's!.*von ungef..hr \(\([0-9]\|\.\)\+\).*!\1!;T;p' | head -n 1 | tr -d '.'
}
# Look up the hit count of every word given as an argument.
# Globals:   TAB (read)
# Arguments: $@ - the words to look up
# Outputs:   one "<hits><TAB><word>" line per word on stdout, sorted
#            numerically in ascending order of hits
query_words() {
    local term hits
    for term in "$@"; do
        hits=$(word_count "$term")
        printf '%s\n' "${hits}${TAB}${term}"
    done | sort -n
}
# Print the word(s) with the most hits from a hit list.
# Globals:   TAB (read)
# Arguments: $1 - hit list made of "<hits><TAB><word>" lines, sorted by
#                 ascending hits (the last line must carry the highest count)
# Outputs:   "winner<TAB><word>" on stdout; on a tie every word sharing the
#            highest count is printed, each on its own line
function find_winners() {
    # work on the list that was passed in, not on the caller's global variable
    local hits="$1"
    local highest winner
    # the list is sorted ascending, so the last line carries the highest count
    highest=$(printf '%s\n' "$hits" | tail -n 1 | sed "s/$TAB.*//")
    # collect every word whose count equals the highest one
    winner=$(printf '%s\n' "$hits" | grep "^$highest$TAB" | sed "s/.*$TAB//")
    echo "winner$TAB$winner"
}
#-------------------------------------------------------------------------------
# no word given: usage exit
if [ $# -eq 0 ]; then
    # ${0##*/} is the script name without its directory; unlike an unquoted
    # $(basename $0) it also works when the path contains spaces
    echo >&2 "usage: ${0##*/} word .."
    exit 1
fi
# get hits for all words given on the command line and print them
result=$(query_words "$@")
printf '%s\n' "$result"
# print winner (only meaningful when more than one word competes);
# an if statement is used instead of "[ ... ] && ..." so that a successful
# single-word run does not end with exit status 1
if [ $# -gt 1 ]; then
    find_winners "$result"
fi
#===============================================================================
#:mode=shellscript:noTabs=false:tabSize=4:indentSize=4:lineSeparator=\n: