#!/usr/bin/env bash

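#
# Download the first n Paul Graham essays listed in the pgessays.rss feed on
# aaronsw.com, convert each one to plain text, and concatenate them into
# pg.txt. A per-essay copy is written to pg-NNN-one.txt and a cumulative
# snapshot to pg-NNN-all.txt.
#
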
function usage {
    echo "usage: $0 <n>"
    echo "note: n is the number of essays to download"
    echo "for a given n, the resulting pg.txt file will have the following number of tokens:"
    echo "n   | tokens"
    echo "--- | ---"
    echo "1   | 6230"
    echo "2   | 23619"
    echo "5   | 25859"
    echo "10  | 36888"
    echo "15  | 50188"
    echo "20  | 59094"
    echo "25  | 88764"
    echo "30  | 103121"
    echo "32  | 108338"
    echo "35  | 113403"
    echo "40  | 127699"
    echo "45  | 135896"
    exit 1
}

function has_cmd {
    if ! [ -x "$(command -v "$1")" ]; then
        echo "error: $1 is not available" >&2
        exit 1
    fi
}

# check for: curl, html2text, tail, sed, fmt
has_cmd curl
has_cmd html2text
has_cmd tail
has_cmd sed
has_cmd fmt

if [ $# -ne 1 ]; then
    usage
fi

n=$1

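# The pipeline below pulls the essay URLs out of the RSS feed: it keeps the
# lines that contain an html link, strips everything before "http" and after
# "html", and takes the first n results.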
# get urls
urls="$(curl http://www.aaronsw.com/2002/feeds/pgessays.rss | grep html | sed -e "s/.*http/http/" | sed -e "s/html.*/html/" | head -n "$n")"

printf "urls:\n%s\n" "$urls"

if [ -f pg.txt ]; then
    rm pg.txt
fi

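# For each URL: fetch the page, convert it to plain text with html2text, drop
# the first three lines of the converted page, strip leading whitespace, and
# wrap the text at 80 columns. Each essay goes to its own pg-NNN-one.txt and is
# appended to pg.txt; pg-NNN-all.txt is a snapshot of pg.txt after each essay.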
c=1
for url in $urls; do # word-splitting on $urls is intentional here
    echo "processing $url"

    cc=$(printf "%03d" "$c")

    curl -L "$url" | html2text | tail -n +4 | sed -E "s/^[[:space:]]+//g" | fmt -w 80 >> "pg-$cc-one.txt"
    cat "pg-$cc-one.txt" >> pg.txt

    cp -v pg.txt "pg-$cc-all.txt"
    c=$((c+1))

    # don't flood the server
    sleep 1
done

echo "done. data in pg.txt"

exit 0