mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-11-02 09:12:03 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			71 lines
		
	
	
		
			1.4 KiB
		
	
	
	
		
			Bash
		
	
	
		
			Executable File
		
	
	
	
	
			
		
		
	
	
			71 lines
		
	
	
		
			1.4 KiB
		
	
	
	
		
			Bash
		
	
	
		
			Executable File
		
	
	
	
	
#!/usr/bin/env bash
 | 
						|
 | 
						|
# Print the usage text, followed by a table of the number of tokens the
# resulting pg.txt contains for selected values of n, to stdout and then
# terminate the script with exit status 1.
function usage {
    # one output line per argument; the script name comes first and the
    # <n> placeholder is separated from it by a space
    printf '%s\n' \
        "usage: $0 <n>" \
        "note: n is the number of essays to download" \
        "for specific n, the resulting pg.txt file will have the following number of tokens:" \
        "n   | tokens" \
        "--- | ---" \
        "1   | 6230" \
        "2   | 23619" \
        "5   | 25859" \
        "10  | 36888" \
        "15  | 50188" \
        "20  | 59094" \
        "25  | 88764" \
        "30  | 103121" \
        "32  | 108338" \
        "35  | 113403" \
        "40  | 127699" \
        "45  | 135896"
    exit 1
}
 | 
						|
 | 
						|
# Abort the script unless the command named by $1 can be invoked.
# Arguments: $1 - name of the command to look up
# Outputs:   an error message on stderr when the command is missing
# Exits:     with status 1 when the command is missing
function has_cmd {
    # Rely on the exit status of `command -v` rather than testing its output
    # with -x: for builtins and functions it prints a bare name, not a path,
    # which an executable-file test would wrongly reject.
    if ! command -v "$1" >/dev/null 2>&1; then
        echo "error: $1 is not available" >&2
        exit 1
    fi
}
 | 
						|
 | 
						|
# check for: curl, html2text, grep, head, tail, sed, fmt
# (every external filter used by the download pipelines of this script)
has_cmd curl
has_cmd html2text
has_cmd grep
has_cmd head
has_cmd tail
has_cmd sed
has_cmd fmt
 | 
						|
 | 
						|
# exactly one argument is expected: the number of essays to download
if [ $# -ne 1 ]; then
    usage
fi

n=$1

# n is later handed to `head -n`, so accept only a plain run of digits;
# anything else (empty, negative, non-numeric) gets the usage text
case "$n" in
    '' | *[!0-9]*) usage ;;
esac
 | 
						|
 | 
						|
# get urls
# Fetch the feed on its own so a failed transfer is detected; inside a
# pipeline its status would be hidden behind that of the last stage.
# -f turns HTTP errors into a non-zero exit, -L follows redirects.
feed="$(curl -fL http://www.aaronsw.com/2002/feeds/pgessays.rss)" || {
    echo "error: failed to download the essay feed" >&2
    exit 1
}

# keep the lines mentioning "html", trim each one down to the
# http...html link it contains and take the first n of them
urls="$(printf '%s\n' "$feed" | grep html | sed -e "s/.*http/http/" -e "s/html.*/html/" | head -n "$n")"

# nothing to download (no links in the feed, or n is 0)
if [ -z "$urls" ]; then
    echo "error: no essay urls found" >&2
    exit 1
fi

printf "urls:\n%s\n" "$urls"
 | 
						|
 | 
						|
# start from scratch: delete the pg.txt of a previous run, if there is one
[ ! -f pg.txt ] || rm pg.txt
 | 
						|
 | 
						|
# Download every essay, convert it to plain text and accumulate it in pg.txt.
# For essay number NNN (zero-padded, starting at 001) two files are written:
#   pg-NNN-one.txt - the text of that essay alone
#   pg-NNN-all.txt - copy of pg.txt as it is after appending that essay
c=1
# $urls is left unquoted on purpose: word splitting turns the
# newline-separated list into the individual loop items
for url in $urls; do
    echo "processing $url"

    cc=$(printf "%03d" "$c")
    one="pg-$cc-one.txt"

    # html -> text, skip the first 3 output lines, strip leading whitespace,
    # re-wrap at 80 columns. The per-essay file is truncated (>) rather than
    # appended to, so running the script again does not duplicate its content.
    curl -L "$url" | html2text | tail -n +4 | sed -E "s/^[[:space:]]+//g" | fmt -w 80 > "$one"

    # an empty result means the download or the conversion produced nothing
    if [ ! -s "$one" ]; then
        echo "warning: no text extracted from $url" >&2
    fi

    cat "$one" >> pg.txt

    cp -v pg.txt "pg-$cc-all.txt"
    c=$((c+1))

    # don't flood the server
    sleep 1
done

echo "done. data in pg.txt"

exit 0
 |