mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-31 08:51:55 +00:00 
			
		
		
		
	scripts : add server-llm.sh (#3868)
* scripts : add deploy-server.sh * scripts : rename to server-llm.sh * scripts : working curl pipe
This commit is contained in:
		
							
								
								
									
										391
									
								
								scripts/server-llm.sh
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										391
									
								
								scripts/server-llm.sh
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,391 @@ | |||||||
|  | #!/bin/bash | ||||||
|  | # | ||||||
|  | # Helper script for deploying llama.cpp server with a single Bash command | ||||||
|  | # | ||||||
|  | # - Works on Linux and macOS | ||||||
|  | # - Supports: CPU, CUDA, Metal, OpenCL | ||||||
|  | # - Can run all GGUF models from HuggingFace | ||||||
|  | # - Can serve requests in parallel | ||||||
|  | # - Always builds latest llama.cpp from GitHub | ||||||
|  | # | ||||||
|  | # Limitations | ||||||
|  | # | ||||||
|  | # - Chat templates are poorly supported (base models recommended) | ||||||
|  | # - Might be unstable! | ||||||
|  | # | ||||||
|  | # Usage: | ||||||
|  | #   ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose] | ||||||
|  | # | ||||||
|  | #   --port:       port number, default is 8888 | ||||||
|  | #   --repo:       path to a repo containing GGUF model files | ||||||
|  | #   --wtype:      weights type (f16, q8_0, q4_0, q4_1), default is user-input | ||||||
|  | #   --backend:    cpu, cuda, metal, opencl, depends on the OS | ||||||
|  | #   --gpu-id:     gpu id, default is 0 | ||||||
|  | #   --n-parallel: number of parallel requests, default is 8 | ||||||
|  | #   --n-kv:       KV cache size, default is 4096 | ||||||
|  | #   --verbose:    verbose output | ||||||
|  | # | ||||||
|  | # Example: | ||||||
|  | # | ||||||
|  | #   bash -c "$(curl -s https://ggml.ai/server-llm.sh)" | ||||||
|  | # | ||||||
|  |  | ||||||
set -e

# required utils: curl, git, make
# probe all required tools up front and report every missing one at once,
# instead of aborting on the first (messages unchanged; still exits 1)
missing_tools=""
for tool in curl git make; do
    if ! command -v "$tool" &> /dev/null; then
        printf "[-] %s not found\n" "$tool"
        missing_tools="$missing_tools $tool"
    fi
done
if [[ -n "$missing_tools" ]]; then
    exit 1
fi
|  |  | ||||||
# default option values (overridable via the command line below)
port=8888
repo=""
wtype=""
backend="cpu"

# pick a sensible default backend: Metal on macOS, CUDA when nvcc is present
case "$OSTYPE" in
    darwin*)
        backend="metal"
        ;;
    *)
        if command -v nvcc &> /dev/null; then
            backend="cuda"
        fi
        ;;
esac

gpu_id=0
n_parallel=8
n_kv=4096
verbose=0
|  |  | ||||||
# print the CLI help text (same options as documented in the file header)
function print_usage {
    cat <<'EOF'
Usage:
  ./server-llm.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose]

  --port:       port number, default is 8888
  --repo:       path to a repo containing GGUF model files
  --wtype:      weights type (f16, q8_0, q4_0, q4_1), default is user-input
  --backend:    cpu, cuda, metal, opencl, depends on the OS
  --gpu-id:     gpu id, default is 0
  --n-parallel: number of parallel requests, default is 8
  --n-kv:       KV cache size, default is 4096
  --verbose:    verbose output

Example:

  bash -c "$(curl -s https://ggml.ai/server-llm.sh)"

EOF
}
|  |  | ||||||
# command-line option parsing: value options consume their argument,
# flags consume only themselves; anything unknown prints usage and aborts
while [[ $# -gt 0 ]]; do
    arg="$1"
    case "$arg" in
        --port)       port="$2";       shift; shift ;;
        --repo)       repo="$2";       shift; shift ;;
        --wtype)      wtype="$2";      shift; shift ;;
        --backend)    backend="$2";    shift; shift ;;
        --gpu-id)     gpu_id="$2";     shift; shift ;;
        --n-parallel) n_parallel="$2"; shift; shift ;;
        --n-kv)       n_kv="$2";       shift; shift ;;
        --verbose)    verbose=1;       shift ;;
        --help)
            print_usage
            exit 0
            ;;
        *)
            echo "Unknown argument: $arg"
            print_usage
            exit 1
            ;;
    esac
done
|  |  | ||||||
|  | # available weights types | ||||||
|  | wtypes=("F16" "Q8_0" "Q4_0" "Q4_1" "Q5_0" "Q5_1" "Q6_K" "Q5_K_M" "Q5_K_S" "Q4_K_M" "Q4_K_S" "Q3_K_L" "Q3_K_M" "Q3_K_S" "Q2_K") | ||||||
|  |  | ||||||
|  | wfiles=() | ||||||
|  | for wt in "${wtypes[@]}"; do | ||||||
|  |     wfiles+=("") | ||||||
|  | done | ||||||
|  |  | ||||||
|  | # sample repos | ||||||
|  | repos=( | ||||||
|  |     "https://huggingface.co/TheBloke/Llama-2-7B-GGUF" | ||||||
|  |     "https://huggingface.co/TheBloke/Llama-2-13B-GGUF" | ||||||
|  |     "https://huggingface.co/TheBloke/Llama-2-70B-GGUF" | ||||||
|  |     "https://huggingface.co/TheBloke/CodeLlama-7B-GGUF" | ||||||
|  |     "https://huggingface.co/TheBloke/CodeLlama-13B-GGUF" | ||||||
|  |     "https://huggingface.co/TheBloke/CodeLlama-34B-GGUF" | ||||||
|  |     "https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF" | ||||||
|  |     "https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF" | ||||||
|  |     "https://huggingface.co/TheBloke/OpenHermes-2-Mistral-7B-GGUF" | ||||||
|  |     "https://huggingface.co/TheBloke/CausalLM-7B-GGUF" | ||||||
|  | ) | ||||||
|  |  | ||||||
|  | printf "\n" | ||||||
|  | printf "[I] This is a helper script for deploying llama.cpp's server on this machine.\n\n" | ||||||
|  | printf "    Based on the options that follow, the script might download a model file\n" | ||||||
|  | printf "    from the internet, which can be a few GBs in size. The script will also\n" | ||||||
|  | printf "    build the latest llama.cpp source code from GitHub, which can be unstable.\n" | ||||||
|  | printf "\n" | ||||||
|  | printf "    Upon success, an HTTP server will be started and it will serve the selected\n" | ||||||
|  | printf "    model using llama.cpp for demonstration purposes.\n" | ||||||
|  | printf "\n" | ||||||
|  | printf "    Please note:\n" | ||||||
|  | printf "\n" | ||||||
|  | printf "    - All new data will be stored in the current folder\n" | ||||||
|  | printf "    - The server will be listening on all network interfaces\n" | ||||||
|  | printf "    - The server will run with default settings which are not always optimal\n" | ||||||
|  | printf "    - Do not judge the quality of a model based on the results from this script\n" | ||||||
|  | printf "    - Do not use this script to benchmark llama.cpp\n" | ||||||
|  | printf "    - Do not use this script in production\n" | ||||||
|  | printf "    - This script is only for demonstration purposes\n" | ||||||
|  | printf "\n" | ||||||
|  | printf "    If you don't know what you are doing, please press Ctrl-C to abort now\n" | ||||||
|  | printf "\n" | ||||||
|  | printf "    Press Enter to continue ...\n\n" | ||||||
|  |  | ||||||
|  | read | ||||||
|  |  | ||||||
|  | if [[ -z "$repo" ]]; then | ||||||
|  |     printf "[+] No repo provided from the command line\n" | ||||||
|  |     printf "    Please select a number from the list below or enter an URL:\n\n" | ||||||
|  |  | ||||||
|  |     is=0 | ||||||
|  |     for r in "${repos[@]}"; do | ||||||
|  |         printf "    %2d) %s\n" $is "$r" | ||||||
|  |         is=$((is+1)) | ||||||
|  |     done | ||||||
|  |  | ||||||
|  |     # ask for repo until index of sample repo is provided or an URL | ||||||
|  |     while [[ -z "$repo" ]]; do | ||||||
|  |         printf "\n    Or choose one from: https://huggingface.co/models?sort=trending&search=gguf\n\n" | ||||||
|  |         read -p "[+] Select repo: " repo | ||||||
|  |  | ||||||
|  |         # check if the input is a number | ||||||
|  |         if [[ "$repo" =~ ^[0-9]+$ ]]; then | ||||||
|  |             if [[ "$repo" -ge 0 && "$repo" -lt ${#repos[@]} ]]; then | ||||||
|  |                 repo="${repos[$repo]}" | ||||||
|  |             else | ||||||
|  |                 printf "[-] Invalid repo index: %s\n" "$repo" | ||||||
|  |                 repo="" | ||||||
|  |             fi | ||||||
|  |         elif [[ "$repo" =~ ^https?:// ]]; then | ||||||
|  |             repo="$repo" | ||||||
|  |         else | ||||||
|  |             printf "[-] Invalid repo URL: %s\n" "$repo" | ||||||
|  |             repo="" | ||||||
|  |         fi | ||||||
|  |     done | ||||||
|  | fi | ||||||
|  |  | ||||||
# remove suffix: accept browser-copied URLs that end in "/tree/main"
repo=$(echo "$repo" | sed -E 's/\/tree\/main$//g')

printf "[+] Checking for GGUF model files in %s\n" "$repo"

# find GGUF files in the source
# NOTE(review): this scrapes the HuggingFace file-browser HTML and depends on
# its exact markup (<span class="truncate group-hover:underline">); it will
# silently yield an empty list if HF changes the page — confirm when updating
# TODO: better logic
model_tree="${repo%/}/tree/main"
model_files=$(curl -s "$model_tree" | grep -i "\\.gguf</span>" | sed -E 's/.*<span class="truncate group-hover:underline">(.*)<\/span><\/a>/\1/g')

# list all files in the provided git repo
# (unquoted $model_files is deliberate: it is a newline-separated list, and
# filenames are assumed to contain no whitespace — TODO confirm)
printf "[+] Model files:\n\n"
for file in $model_files; do
    # determine iw by grepping the filename with wtypes:
    # iw = index of the first quantization tag found in the filename,
    # or -1 when the file carries no known tag
    iw=-1
    is=0
    for wt in "${wtypes[@]}"; do
        # uppercase the filename so the tag match is case-insensitive
        ufile=$(echo "$file" | tr '[:lower:]' '[:upper:]')
        # quoted RHS of =~ makes this a literal substring match, not a regex
        if [[ "$ufile" =~ "$wt" ]]; then
            iw=$is
            break
        fi
        is=$((is+1))
    done

    # skip files without a recognizable weight-type tag
    if [[ $iw -eq -1 ]]; then
        continue
    fi

    # remember the filename for this weight type (selected by index later)
    wfiles[$iw]="$file"

    # mark entries that already exist locally with a "*"
    have=" "
    if [[ -f "$file" ]]; then
        have="*"
    fi

    printf "    %2d) %s %s\n" $iw "$have" "$file"
done
|  |  | ||||||
# if a weight type name was passed via --wtype (e.g. "q4_0"), resolve it to
# its index in wtypes so that wfile gets set; without this mapping, a
# command-line --wtype skipped the interactive loop below and left wfile
# empty, producing a broken download URL.  Unknown or unavailable types fall
# through to the interactive prompt.
if [[ -n "$wtype" ]]; then
    wtype_name=$(echo "$wtype" | tr '[:lower:]' '[:upper:]')
    wtype=""
    is=0
    for wt in "${wtypes[@]}"; do
        # only accept the type when the repo actually has a file for it
        if [[ "$wt" == "$wtype_name" && -n "${wfiles[$is]}" ]]; then
            wtype=$is
            wfile="${wfiles[$is]}"
            break
        fi
        is=$((is+1))
    done
    if [[ -z "$wtype" ]]; then
        printf "[-] Requested weight type %s not available in this repo\n" "$wtype_name"
    fi
fi

# ask for weights type (by index) until provided and available
while [[ -z "$wtype" ]]; do
    printf "\n"
    read -r -p "[+] Select weight type: " wtype
    wfile="${wfiles[$wtype]}"

    if [[ -z "$wfile" ]]; then
        printf "[-] Invalid weight type: %s\n" "$wtype"
        wtype=""
    fi
done

printf "[+] Selected weight type: %s (%s)\n" "$wtype" "$wfile"
|  |  | ||||||
|  | url="${repo%/}/resolve/main/$wfile" | ||||||
|  |  | ||||||
|  | # check file if the model has been downloaded before | ||||||
|  | chk="$wfile.chk" | ||||||
|  |  | ||||||
|  | # check if we should download the file | ||||||
|  | # - if $wfile does not exist | ||||||
|  | # - if $wfile exists but $chk does not exist | ||||||
|  | # - if $wfile exists and $chk exists but $wfile is newer than $chk | ||||||
|  | # TODO: better logic using git lfs info | ||||||
|  |  | ||||||
|  | do_download=0 | ||||||
|  |  | ||||||
|  | if [[ ! -f "$wfile" ]]; then | ||||||
|  |     do_download=1 | ||||||
|  | elif [[ ! -f "$chk" ]]; then | ||||||
|  |     do_download=1 | ||||||
|  | elif [[ "$wfile" -nt "$chk" ]]; then | ||||||
|  |     do_download=1 | ||||||
|  | fi | ||||||
|  |  | ||||||
|  | if [[ $do_download -eq 1 ]]; then | ||||||
|  |     printf "[+] Downloading weights from %s\n" "$url" | ||||||
|  |  | ||||||
|  |     # download the weights file | ||||||
|  |     curl -o "$wfile" -# -L "$url" | ||||||
|  |  | ||||||
|  |     # create a check file if successful | ||||||
|  |     if [[ $? -eq 0 ]]; then | ||||||
|  |         printf "[+] Creating check file %s\n" "$chk" | ||||||
|  |         touch "$chk" | ||||||
|  |     fi | ||||||
|  | else | ||||||
|  |     printf "[+] Using cached weights %s\n" "$wfile" | ||||||
|  | fi | ||||||
|  |  | ||||||
# get latest llama.cpp and build

printf "[+] Downloading latest llama.cpp\n"

# one working directory per server port, so multiple instances can coexist
llama_cpp_dir="__llama_cpp_port_${port}__"

if [[ -d "$llama_cpp_dir" && ! -f "$llama_cpp_dir/__ggml_script__" ]]; then
    # if the dir exists and there isn't a file "__ggml_script__" in it, abort
    # (the marker file proves this script created the directory, so we never
    # clobber an unrelated directory of the same name)
    printf "[-] Directory %s already exists\n" "$llama_cpp_dir"
    printf "[-] Please remove it and try again\n"
    exit 1
elif [[ -d "$llama_cpp_dir" ]]; then
    printf "[+] Directory %s already exists\n" "$llama_cpp_dir"
    printf "[+] Using cached llama.cpp\n"

    # discard any local changes and fast-forward to the latest upstream master
    cd "$llama_cpp_dir"
    git reset --hard
    git fetch
    git checkout origin/master

    cd ..
else
    printf "[+] Cloning llama.cpp\n"

    git clone https://github.com/ggerganov/llama.cpp "$llama_cpp_dir"
fi

# mark that the directory was made by this script
touch "$llama_cpp_dir/__ggml_script__"
|  |  | ||||||
# trace every command from here on when --verbose was requested
if [[ $verbose -eq 1 ]]; then
    set -x
fi

# build the server target from a clean tree
cd "$llama_cpp_dir"

make clean

# silence make output unless --verbose (note: $log is expanded unquoted on
# purpose so an empty value contributes no argument)
log="--silent"
if [[ $verbose -eq 1 ]]; then
    log=""
fi

case "$backend" in
    cuda)
        printf "[+] Building with CUDA backend\n"
        LLAMA_CUBLAS=1 make -j server $log
        ;;
    cpu)
        printf "[+] Building with CPU backend\n"
        make -j server $log
        ;;
    metal)
        printf "[+] Building with Metal backend\n"
        make -j server $log
        ;;
    opencl)
        printf "[+] Building with OpenCL backend\n"
        LLAMA_CLBLAST=1 make -j server $log
        ;;
    *)
        printf "[-] Unknown backend: %s\n" "$backend"
        exit 1
        ;;
esac
|  |  | ||||||
# run the server

printf "[+] Running server\n"

# per-backend GPU offload flags ($args is expanded unquoted on purpose so it
# splits into separate words)
args=""
case "$backend" in
    cuda)
        # pin the server to the requested GPU
        export CUDA_VISIBLE_DEVICES=$gpu_id
        args="-ngl 999"
        ;;
    cpu)
        args="-ngl 0"
        ;;
    metal|opencl)
        args="-ngl 999"
        ;;
    *)
        printf "[-] Unknown backend: %s\n" "$backend"
        exit 1
        ;;
esac

if [[ $verbose -eq 1 ]]; then
    args="$args --verbose"
fi

# model file lives one directory up (we cd'd into the build dir earlier)
./server -m "../$wfile" --host 0.0.0.0 --port "$port" -c $n_kv -np "$n_parallel" $args

exit 0
		Reference in New Issue
	
	Block a user
	 Georgi Gerganov
					Georgi Gerganov