scripts/get-pg.sh
#!/bin/bash
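# Fetch the first <n> essays listed in the RSS feed at
# http://www.aaronsw.com/2002/feeds/pgessays.rss, convert each page to plain
# text with html2text, and concatenate the results into pg.txt.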
function usage {
    echo "usage: $0 <n>"
    echo "note: n is the number of essays to download"
    echo "for specific n, the resulting pg.txt file will have the following number of tokens:"
    echo "n   | tokens"
    echo "--- | ---"
    echo "1   | 6230"
    echo "2   | 23619"
    echo "5   | 25859"
    echo "10  | 36888"
    echo "15  | 50188"
    echo "20  | 59094"
    echo "25  | 88764"
    echo "30  | 103121"
    echo "32  | 108338"
    echo "35  | 113403"
    echo "40  | 127699"
    echo "45  | 135896"
    exit 1
}

function has_cmd {
    if ! [ -x "$(command -v "$1")" ]; then
        echo "error: $1 is not available" >&2
        exit 1
    fi
}

# check for: curl, html2text, tail, sed, fmt
has_cmd curl
has_cmd html2text
has_cmd tail
has_cmd sed
has_cmd fmt

if [ $# -ne 1 ]; then
    usage
fi

n=$1

# extract the first n essay urls from the rss feed
urls="$(curl http://www.aaronsw.com/2002/feeds/pgessays.rss | grep html | sed -e "s/.*http/http/" | sed -e "s/html.*/html/" | head -n "$n")"

printf "urls:\n%s\n" "$urls"

# remove any previous output so we start fresh
if [ -f pg.txt ]; then
    rm pg.txt
fi

c=1
for url in $urls; do
    echo "processing $url"

    cc=$(printf "%03d" "$c")

    # convert the page to plain text: drop the first three lines of the
    # converted page, strip leading whitespace and re-wrap at 80 columns
    curl -L "$url" | html2text | tail -n +4 | sed -E "s/^[[:space:]]+//g" | fmt -w 80 >> pg-$cc-one.txt
    cat pg-$cc-one.txt >> pg.txt

    # pg-NNN-one.txt holds a single essay; pg-NNN-all.txt is a cumulative snapshot
    cp -v pg.txt pg-$cc-all.txt
    c=$((c+1))

    # don't flood the server
    sleep 1
done

echo "done. data in pg.txt"

exit 0
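
# Example invocation (run from the repository root):
#
#   ./scripts/get-pg.sh 10
#
# downloads the first 10 essays listed in the feed and leaves
# pg-001-one.txt .. pg-010-one.txt (one essay each), pg-NNN-all.txt snapshots,
# and the combined pg.txt (~36888 tokens for n=10, per the table in usage).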