summary | refs | log | tree | commit | diff
path: root/llama.cpp/scripts/get-pg.sh
diff options
context:
space:
mode:
Diffstat (limited to 'llama.cpp/scripts/get-pg.sh')
-rwxr-xr-x llama.cpp/scripts/get-pg.sh 70
1 files changed, 70 insertions, 0 deletions
diff --git a/llama.cpp/scripts/get-pg.sh b/llama.cpp/scripts/get-pg.sh
new file mode 100755
index 0000000..f180bf8
--- /dev/null
+++ b/llama.cpp/scripts/get-pg.sh
@@ -0,0 +1,70 @@
+#!/usr/bin/env bash
+
+function usage {
+ echo "usage: <n>$0"
+ echo "note: n is the number of essays to download"
+ echo "for specific n, the resulting pg.txt file will have the following number of tokens:"
+ echo "n | tokens"
+ echo "--- | ---"
+ echo "1 | 6230"
+ echo "2 | 23619"
+ echo "5 | 25859"
+ echo "10 | 36888"
+ echo "15 | 50188"
+ echo "20 | 59094"
+ echo "25 | 88764"
+ echo "30 | 103121"
+ echo "32 | 108338"
+ echo "35 | 113403"
+ echo "40 | 127699"
+ echo "45 | 135896"
+ exit 1
+}
+
+# Abort the script unless the shell can resolve the given command name.
+#
+# Arguments: $1 - name of the command to look for.
+# Outputs:   an error message on stderr when the command cannot be found.
+# Exits:     with status 1 when the command is missing; returns 0 otherwise.
+function has_cmd {
+    # Rely on the exit status of `command -v` instead of testing its output
+    # with -x: the output is not always a file path (builtins and functions
+    # print just their name), and "$1" is quoted so the name is passed through
+    # as a single word.
+    if ! command -v "$1" >/dev/null 2>&1; then
+        echo "error: $1 is not available" >&2
+        exit 1
+    fi
+}
+
+# check for: curl, html2text, tail, sed, fmt
+has_cmd curl
+has_cmd html2text
+has_cmd tail
+has_cmd sed
+
+# exactly one argument is expected: the number of essays to download
+if [ $# -ne 1 ]; then
+    usage
+fi
+
+# n is handed to `head -n` further down, so anything other than a positive
+# integer would either make head fail or select the wrong set of lines
+if ! [[ "$1" =~ ^0*[1-9][0-9]*$ ]]; then
+    echo "error: n must be a positive integer, got '$1'" >&2
+    usage
+fi
+
+n=$1
+
+# get urls
+urls="$(curl http://www.aaronsw.com/2002/feeds/pgessays.rss | grep html | sed -e "s/.*http/http/" | sed -e "s/html.*/html/" | head -n $n)"
+
+printf "urls:\n%s\n" "$urls"
+
+if [ -f pg.txt ]; then
+ rm pg.txt
+fi
+
+c=1
+for url in $urls; do
+ echo "processing $url"
+
+ cc=$(printf "%03d" $c)
+
+ curl -L $url | html2text | tail -n +4 | sed -E "s/^[[:space:]]+//g" | fmt -w 80 >> pg-$cc-one.txt
+ cat pg-$cc-one.txt >> pg.txt
+
+ cp -v pg.txt pg-$cc-all.txt
+ c=$((c+1))
+
+ # don't flood the server
+ sleep 1
+done
+
+echo "done. data in pg.txt"
+
+exit 0