#!/usr/bin/env bash

function usage {
    echo "usage: $0 <n>"
    echo "note: n is the number of essays to download"
    echo "for a specific n, the resulting pg.txt file will have the following number of tokens:"
    echo "n | tokens"
    echo "--- | ---"
    echo "1 | 6230"
    echo "2 | 23619"
    echo "5 | 25859"
    echo "10 | 36888"
    echo "15 | 50188"
    echo "20 | 59094"
    echo "25 | 88764"
    echo "30 | 103121"
    echo "32 | 108338"
    echo "35 | 113403"
    echo "40 | 127699"
    echo "45 | 135896"
    exit 1
}

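# has_cmd <name> - exit with an error if the given command is not available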
function has_cmd {
    if ! [ -x "$(command -v "$1")" ]; then
        echo "error: $1 is not available" >&2
        exit 1
    fi
}

# check for: curl, html2text, tail, sed, fmt
has_cmd curl
has_cmd html2text
has_cmd tail
has_cmd sed
has_cmd fmt

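# require exactly one argument: the number of essays to download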
if [ $# -ne 1 ]; then
    usage
fi

n=$1

# get the list of essay urls from the rss feed
urls="$(curl http://www.aaronsw.com/2002/feeds/pgessays.rss | grep html | sed -e "s/.*http/http/" | sed -e "s/html.*/html/" | head -n "$n")"

printf "urls:\n%s\n" "$urls"

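# start from a clean output file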
if [ -f pg.txt ]; then
    rm pg.txt
fi

c=1
for url in $urls; do
    echo "processing $url"

    cc=$(printf "%03d" "$c")

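    # fetch the essay, convert it to plain text, drop the first three header lines,
    # strip leading whitespace and wrap to 80 columns;
    # pg-XXX-one.txt holds this essay alone, pg.txt and pg-XXX-all.txt accumulate all essays so far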
    curl -L "$url" | html2text | tail -n +4 | sed -E "s/^[[:space:]]+//g" | fmt -w 80 >> pg-$cc-one.txt
    cat pg-$cc-one.txt >> pg.txt

    cp -v pg.txt pg-$cc-all.txt
    c=$((c+1))

    # don't flood the server
    sleep 1
done

echo "done. data in pg.txt"

exit 0