#!/bin/sh
#
# Build a crude keyword summary for each manual page named in a list file.
#
# Usage: run from the directory holding the list file:
#     <this script> [LISTFILE]
#   LISTFILE  file of whitespace-separated manual page names
#             (default: "manpages" in the current directory).
#
# For every name N the script prints N, formats the page with man(1),
# and writes the 40 most commonly occurring words of the page to "N.doc".
# Each output line is "<count> <word>"; lines are ordered alphabetically
# by word.  Single letters and a fixed list of stop words are not counted.
#
# Exit status: non-zero if LISTFILE cannot be read.

#######################################
# Reduce text to its most frequent words.
# Arguments: $1 - how many words to keep (optional, default 40)
# Inputs:    plain text on stdin
# Outputs:   "<count> <word>" lines on stdout, sorted by word
#######################################
top_words() {
  # 1. Put every run of non-letters on a line break, giving one word per
  #    line, then fold to lower case so "The" and "the" count together.
  # 2. Drop empty lines, single letters and the stop words.
  # 3. Count duplicates, keep the N highest counts, and finally order the
  #    survivors by the word (field 2) rather than by the count.
  tr -cs 'A-Za-z' '\012' \
    | tr 'A-Z' 'a-z' \
    | grep -E -v \
        -e '^[a-z]?$' \
        -e '^(the|or|of|it|above|below|after|an|and)$' \
        -e '^(on|not|that|used|use|see|then)$' \
        -e '^(may|with|as|are|but|does|one|have|no)$' \
        -e '^(in|if|to|from|is|be|by|do|like|only|you)$' \
    | sort \
    | uniq -c \
    | sort -n \
    | tail -n "${1:-40}" \
    | sort -k 2
}

#######################################
# Process every manual page named in the list file.
# Arguments: $1 - list file (optional, default "manpages")
# Outputs:   each page name on stdout; one "<name>.doc" file per page
# Returns:   1 if the list file is not readable
#######################################
main() {
  page_list=${1:-manpages}

  if [ ! -r "$page_list" ]; then
    printf '%s: cannot read page list "%s"\n' "${0##*/}" "$page_list" >&2
    return 1
  fi

  # The names are data, never glob patterns: switch pathname expansion off
  # while the list is split into words on whitespace.
  set -f
  # shellcheck disable=SC2013  # splitting into words (not lines) is intended
  for page in $(cat "$page_list"); do
    printf '%s\n' "$page"
    # col -b strips the backspace overstriking man uses for bold/underline.
    man "$page" \
      | col -b \
      | top_words > "${page}.doc"
  done
  set +f
}

main "$@"