mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-30 08:42:00 +00:00 
			
		
		
		
	scripts : improve get-pg.sh (#4838)
This commit is contained in:
		| @@ -2,6 +2,22 @@ | ||||
|  | ||||
# Print the command-line help and terminate the script.
#
# The help shows how to invoke the script (one argument, n = number of
# essays to download) followed by a reference table of the token count
# of the resulting pg.txt for selected values of n.
#
# Outputs: help text on stdout.
# Exits:   always with status 1.
function usage {
    # Unquoted here-doc delimiter so that $0 expands to the script name.
    cat <<EOF
usage: $0 <n>
note: n is the number of essays to download
for specific n, the resulting pg.txt file will have the following number of tokens:
n   | tokens
--- | ---
1   | 6230
2   | 23619
5   | 25859
10  | 36888
15  | 50188
20  | 59094
25  | 88764
30  | 103121
32  | 108338
35  | 113403
40  | 127699
45  | 135896
EOF
    exit 1
}
|  | ||||
| @@ -33,10 +49,17 @@ if [ -f pg.txt ]; then | ||||
|     rm pg.txt | ||||
| fi | ||||
|  | ||||
| c=1 | ||||
| for url in $urls; do | ||||
|     echo "processing $url" | ||||
|  | ||||
|     curl -L $url | html2text | tail -n +4 | sed -E "s/^[[:space:]]+//g" | fmt -w 80 >> pg.txt | ||||
|     cc=$(printf "%03d" $c) | ||||
|  | ||||
|     curl -L $url | html2text | tail -n +4 | sed -E "s/^[[:space:]]+//g" | fmt -w 80 >> pg-$cc-one.txt | ||||
|     cat pg-$cc-one.txt >> pg.txt | ||||
|  | ||||
|     cp -v pg.txt pg-$cc-all.txt | ||||
|     c=$((c+1)) | ||||
|  | ||||
|     # don't flood the server | ||||
|     sleep 1 | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Georgi Gerganov
					Georgi Gerganov