	examples : server chat mode with llama2 (#2400)
* add: server chat mode with llama2
* fix: remove the unnecessary last \n
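
The chat client added below drives the Llama 2 chat prompt format: a system prompt wrapped in <<SYS>> tags inside the first [INST] block, and each user turn wrapped in [INST] ... [/INST]. As a shape-only sketch of the rendered prompt (note that the script emits the \n sequences literally rather than as newlines):

    [INST] <<SYS>>
    {system prompt}
    <</SYS>>

    {first user message} [/INST] {first assistant reply}
    [INST] {next user message} [/INST]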
examples/server-llama2-13B.sh (new file, 26 lines)
@@ -0,0 +1,26 @@
#!/bin/bash

set -e

cd "$(dirname "$0")/.." || exit

# Specify the model you want to use here:
MODEL="${MODEL:-./models/llama-2-13b-chat.ggmlv3.q5_K_M.bin}"
PROMPT_TEMPLATE=${PROMPT_TEMPLATE:-./prompts/chat-system.txt}

# Adjust to the number of CPU cores you want to use.
N_THREAD="${N_THREAD:-12}"

# Note: you can also override the generation options by specifying them on the command line:
GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 4096 --batch-size 1024}"

# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
./server $GEN_OPTIONS \
  --model "$MODEL" \
  --threads "$N_THREAD" \
  --rope-freq-scale 1.0 \
  "$@"

# The following flag was used to test the model with MPS (Metal GPU offload);
# it is omitted from the general-purpose defaults. To enable it, add it on the
# command line:
# -ngl 1 \
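
A quick way to exercise the launcher (values are illustrative; MODEL and N_THREAD are the overridable variables above, and extra arguments such as --port are passed through to ./server via "$@"):

    # from the repository root, with the server binary already built
    MODEL=./models/llama-2-13b-chat.ggmlv3.q5_K_M.bin N_THREAD=8 \
        ./examples/server-llama2-13B.sh --port 8080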
examples/server/chat-llama2.sh (new file, 109 lines)
@@ -0,0 +1,109 @@
#!/bin/bash

API_URL="${API_URL:-http://127.0.0.1:8080}"

CHAT=(
    "Hello, Assistant."
    "Hello. How may I help you today?"
)

INSTRUCTION="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions."

trim() {
    shopt -s extglob
    set -- "${1##+([[:space:]])}"
    printf "%s" "${1%%+([[:space:]])}"
}

trim_trailing() {
    shopt -s extglob
    printf "%s" "${1%%+([[:space:]])}"
}

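# Build the prompt for the next request. The first request would send only the
# system block; follow-up requests send the last assistant reply plus the new
# question wrapped in [INST] ... [/INST], relying on the server to carry the
# earlier conversation across requests. The \n sequences are sent literally,
# since echo -n does not expand escapes.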
format_prompt() {
    if [[ "${#CHAT[@]}" -eq 0 ]]; then
        echo -n "[INST] <<SYS>>\n${INSTRUCTION}\n<</SYS>>"
    else
        LAST_INDEX=$(( ${#CHAT[@]} - 1 ))
        echo -n "${CHAT[$LAST_INDEX]}\n[INST] $1 [/INST]"
    fi
}

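# POST a string to the server's /tokenize endpoint and print one token id per
# line, so callers can count tokens with wc -l.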
tokenize() {
    curl \
        --silent \
        --request POST \
        --url "${API_URL}/tokenize" \
        --header "Content-Type: application/json" \
        --data-raw "$(jq -ns --arg content "$1" '{content:$content}')" \
    | jq '.tokens[]'
}

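# Token count of the system block; passed as n_keep so the server keeps the
# system prompt at the start of the context when older tokens are evicted.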
N_KEEP=$(tokenize "[INST] <<SYS>>\n${INSTRUCTION}\n<</SYS>>" | wc -l)

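# Send one user turn to /completion with stream: true, print the answer as it
# arrives, and append the question/answer pair to CHAT.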
chat_completion() {
    PROMPT="$(trim_trailing "$(format_prompt "$1")")"
    DATA="$(echo -n "$PROMPT" | jq -Rs --argjson n_keep $N_KEEP '{
        prompt: .,
        temperature: 0.2,
        top_k: 40,
        top_p: 0.9,
        n_keep: $n_keep,
        n_predict: 1024,
        stop: ["[INST]"],
        stream: true
    }')"

    # Create a temporary file to hold the Python output
    TEMPFILE=$(mktemp)

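    # Open the streaming response on fd 3; the server replies with
    # server-sent events, one "data: {json}" line per streamed chunk.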
    exec 3< <(curl \
        --silent \
        --no-buffer \
        --request POST \
        --url "${API_URL}/completion" \
        --header "Content-Type: application/json" \
        --data-raw "${DATA}")

    python -c "
import json
import sys

answer = ''
while True:
    line = sys.stdin.readline()
    if not line:
        break
    if line.startswith('data: '):
        json_content = line[6:].strip()
        content = json.loads(json_content)['content']
        sys.stdout.write(content)
        sys.stdout.flush()
        answer += content

answer = answer.rstrip('\n')

# Write the answer to the temporary file
with open('$TEMPFILE', 'w') as f:
    f.write(answer)
    " <&3

    exec 3<&-

    # Read the answer from the temporary file
    ANSWER=$(cat "$TEMPFILE")

    # Clean up the temporary file
    rm "$TEMPFILE"

    printf "\n"

    CHAT+=("$1" "$(trim "$ANSWER")")
}

while true; do
    echo -en "\033[0;32m"  # Green color
    read -r -e -p "> " QUESTION
    echo -en "\033[0m"  # Reset color
    chat_completion "${QUESTION}"
done
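
To try it end to end (API_URL defaults to the address at the top of the script; curl, jq, and python must be installed):

    # terminal 1: start the server, e.g. with the launcher script above
    ./examples/server-llama2-13B.sh

    # terminal 2: run the interactive chat client against it
    API_URL=http://127.0.0.1:8080 ./examples/server/chat-llama2.sh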
Author: nhamanasu