get-pg.sh
#!/bin/bash

function usage {
    echo "usage: $0 <n>"
    echo "note: n is the number of essays to download"
    echo "for specific n, the resulting pg.txt file will have the following number of tokens:"
    echo "n | tokens"
    echo "--- | ---"
    echo "1 | 6230"
    echo "2 | 23619"
    echo "5 | 25859"
    echo "10 | 36888"
    echo "15 | 50188"
    echo "20 | 59094"
    echo "25 | 88764"
    echo "30 | 103121"
    echo "32 | 108338"
    echo "35 | 113403"
    echo "40 | 127699"
    echo "45 | 135896"
    exit 1
}

# exit if the given command is not available on PATH
function has_cmd {
    if ! [ -x "$(command -v "$1")" ]; then
        echo "error: $1 is not available" >&2
        exit 1
    fi
}

# check for: curl, html2text, tail, sed, fmt
has_cmd curl
has_cmd html2text
has_cmd tail
has_cmd sed
has_cmd fmt

if [ $# -ne 1 ]; then
    usage
fi

n=$1

# get the first n essay urls from the RSS feed
urls="$(curl http://www.aaronsw.com/2002/feeds/pgessays.rss | grep html | sed -e "s/.*http/http/" | sed -e "s/html.*/html/" | head -n "$n")"

printf "urls:\n%s\n" "$urls"

if [ -f pg.txt ]; then
    rm pg.txt
fi

c=1
for url in $urls; do
    echo "processing $url"

    cc=$(printf "%03d" "$c")

    # convert the essay to plain text, strip leading whitespace, wrap at 80 columns
    curl -L "$url" | html2text | tail -n +4 | sed -E "s/^[[:space:]]+//g" | fmt -w 80 >> "pg-$cc-one.txt"
    cat "pg-$cc-one.txt" >> pg.txt

    # keep a cumulative snapshot after each essay
    cp -v pg.txt "pg-$cc-all.txt"
    c=$((c+1))

    # don't flood the server
    sleep 1
done

echo "done. data in pg.txt"

exit 0
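For reference, a minimal invocation sketch (this assumes the script is saved as get-pg.sh in the current directory and that curl, html2text, tail, sed, and fmt are installed):

    chmod +x get-pg.sh
    ./get-pg.sh 10    # fetch the first 10 essays from the feed; the combined text ends up in pg.txt

Besides pg.txt, the loop also leaves per-essay files (pg-001-one.txt, pg-002-one.txt, ...) and cumulative snapshots (pg-001-all.txt, pg-002-all.txt, ...) in the working directory, presumably so you can pick a file close to a desired token count from the table printed by usage.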