mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-11-04 09:32:00 +00:00 
			
		
		
		
	scripts : add deploy-server.sh
This commit is contained in:
		
							
								
								
									
										341
									
								
								scripts/deploy-server.sh
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										341
									
								
								scripts/deploy-server.sh
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,341 @@
 | 
				
			|||||||
 | 
					#!/bin/bash
 | 
				
			||||||
 | 
					#
 | 
				
			||||||
 | 
					# Helper script for deploying llama.cpp server with a single Bash command
 | 
				
			||||||
 | 
					#
 | 
				
			||||||
 | 
					# - Works on Linux and macOS
 | 
				
			||||||
 | 
					# - Supports: CPU, CUDA, Metal, OpenCL
 | 
				
			||||||
 | 
					# - Can run all GGUF models from HuggingFace
 | 
				
			||||||
 | 
					# - Always build latest llama.cpp from GitHub
 | 
				
			||||||
 | 
					# - Might be unstable!
 | 
				
			||||||
 | 
					#
 | 
				
			||||||
 | 
					# Usage:
 | 
				
			||||||
 | 
					#   ./deploy-server.sh [--port] [--repo] [--wtype] [--backend] [--gpu-id] [--n-parallel] [--n-kv] [--verbose]
 | 
				
			||||||
 | 
					#
 | 
				
			||||||
 | 
					#   --port:       port number, default is 8888
 | 
				
			||||||
 | 
					#   --repo:       path to a repo containing GGUF model files
 | 
				
			||||||
 | 
					#   --wtype:      weights type (f16, q8_0, q4_0, q4_1), default is user-input
 | 
				
			||||||
 | 
					#   --backend:    cpu, cuda, metal, opencl, depends on the OS
 | 
				
			||||||
 | 
					#   --gpu-id:     gpu id, default is 0
 | 
				
			||||||
 | 
					#   --n-parallel: number of parallel requests, default is 8
 | 
				
			||||||
 | 
					#   --n-kv:       KV cache size, default is 4096
 | 
				
			||||||
 | 
					#   --verbose:    verbose output
 | 
				
			||||||
 | 
					#
 | 
				
			||||||
 | 
					# Example:
 | 
				
			||||||
 | 
					#
 | 
				
			||||||
 | 
					#   curl https://ggml.ai/deploy-server.sh | bash -s --
 | 
				
			||||||
 | 
					#
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					set -e
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# required utils: curl, git, make
 | 
				
			||||||
 | 
					if ! command -v curl &> /dev/null; then
 | 
				
			||||||
 | 
					    printf "[-] curl not found\n"
 | 
				
			||||||
 | 
					    exit 1
 | 
				
			||||||
 | 
					fi
 | 
				
			||||||
 | 
					if ! command -v git &> /dev/null; then
 | 
				
			||||||
 | 
					    printf "[-] git not found\n"
 | 
				
			||||||
 | 
					    exit 1
 | 
				
			||||||
 | 
					fi
 | 
				
			||||||
 | 
					if ! command -v make &> /dev/null; then
 | 
				
			||||||
 | 
					    printf "[-] make not found\n"
 | 
				
			||||||
 | 
					    exit 1
 | 
				
			||||||
 | 
					fi
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# parse arguments
 | 
				
			||||||
 | 
					port=8888
 | 
				
			||||||
 | 
					repo=""
 | 
				
			||||||
 | 
					wtype=""
 | 
				
			||||||
 | 
					backend="cpu"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# if macOS, use metal backend by default
 | 
				
			||||||
 | 
					if [[ "$OSTYPE" == "darwin"* ]]; then
 | 
				
			||||||
 | 
					    backend="metal"
 | 
				
			||||||
 | 
					elif command -v nvcc &> /dev/null; then
 | 
				
			||||||
 | 
					    backend="cuda"
 | 
				
			||||||
 | 
					fi
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					gpu_id=0
 | 
				
			||||||
 | 
					n_parallel=8
 | 
				
			||||||
 | 
					n_kv=4096
 | 
				
			||||||
 | 
					verbose=0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					while [[ $# -gt 0 ]]; do
 | 
				
			||||||
 | 
					    key="$1"
 | 
				
			||||||
 | 
					    case $key in
 | 
				
			||||||
 | 
					        --port)
 | 
				
			||||||
 | 
					            port="$2"
 | 
				
			||||||
 | 
					            shift
 | 
				
			||||||
 | 
					            shift
 | 
				
			||||||
 | 
					            ;;
 | 
				
			||||||
 | 
					        --repo)
 | 
				
			||||||
 | 
					            repo="$2"
 | 
				
			||||||
 | 
					            shift
 | 
				
			||||||
 | 
					            shift
 | 
				
			||||||
 | 
					            ;;
 | 
				
			||||||
 | 
					        --wtype)
 | 
				
			||||||
 | 
					            wtype="$2"
 | 
				
			||||||
 | 
					            shift
 | 
				
			||||||
 | 
					            shift
 | 
				
			||||||
 | 
					            ;;
 | 
				
			||||||
 | 
					        --backend)
 | 
				
			||||||
 | 
					            backend="$2"
 | 
				
			||||||
 | 
					            shift
 | 
				
			||||||
 | 
					            shift
 | 
				
			||||||
 | 
					            ;;
 | 
				
			||||||
 | 
					        --gpu-id)
 | 
				
			||||||
 | 
					            gpu_id="$2"
 | 
				
			||||||
 | 
					            shift
 | 
				
			||||||
 | 
					            shift
 | 
				
			||||||
 | 
					            ;;
 | 
				
			||||||
 | 
					        --n-parallel)
 | 
				
			||||||
 | 
					            n_parallel="$2"
 | 
				
			||||||
 | 
					            shift
 | 
				
			||||||
 | 
					            shift
 | 
				
			||||||
 | 
					            ;;
 | 
				
			||||||
 | 
					        --n-kv)
 | 
				
			||||||
 | 
					            n_kv="$2"
 | 
				
			||||||
 | 
					            shift
 | 
				
			||||||
 | 
					            shift
 | 
				
			||||||
 | 
					            ;;
 | 
				
			||||||
 | 
					        --verbose)
 | 
				
			||||||
 | 
					            verbose=1
 | 
				
			||||||
 | 
					            shift
 | 
				
			||||||
 | 
					            ;;
 | 
				
			||||||
 | 
					        *)
 | 
				
			||||||
 | 
					            echo "Unknown argument: $key"
 | 
				
			||||||
 | 
					            exit 1
 | 
				
			||||||
 | 
					            ;;
 | 
				
			||||||
 | 
					    esac
 | 
				
			||||||
 | 
					done
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# available weights types
 | 
				
			||||||
 | 
					wtypes=("F16" "Q8_0" "Q4_0" "Q4_1" "Q5_0" "Q5_1" "Q6_K" "Q5_K_M" "Q5_K_S" "Q4_K_M" "Q4_K_S" "Q3_K_L" "Q3_K_M" "Q3_K_S" "Q2_K")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					wfiles=()
 | 
				
			||||||
 | 
					for wt in "${wtypes[@]}"; do
 | 
				
			||||||
 | 
					    wfiles+=("")
 | 
				
			||||||
 | 
					done
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# sample repos
 | 
				
			||||||
 | 
					repos=(
 | 
				
			||||||
 | 
					    "https://huggingface.co/TheBloke/Llama-2-7B-GGUF"
 | 
				
			||||||
 | 
					    "https://huggingface.co/TheBloke/Llama-2-13B-GGUF"
 | 
				
			||||||
 | 
					    "https://huggingface.co/TheBloke/Llama-2-70B-GGUF"
 | 
				
			||||||
 | 
					    "https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF"
 | 
				
			||||||
 | 
					    "https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF"
 | 
				
			||||||
 | 
					    "https://huggingface.co/TheBloke/OpenHermes-2-Mistral-7B-GGUF"
 | 
				
			||||||
 | 
					    "https://huggingface.co/TheBloke/CausalLM-7B-GGUF"
 | 
				
			||||||
 | 
					)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					printf "\n"
 | 
				
			||||||
 | 
					printf "[I] This is a helper script for deploying llama.cpp's server on this machine.\n\n"
 | 
				
			||||||
 | 
					printf "    Based on the options that follow, the script might download a model file\n"
 | 
				
			||||||
 | 
					printf "    from the internet, which can be a few GBs in size. The script will also\n"
 | 
				
			||||||
 | 
					printf "    build the latest llama.cpp source code from GitHub, which can be unstable.\n\n"
 | 
				
			||||||
 | 
					printf "    If you don't know what you are doing, please press Ctrl-C to abort now\n\n"
 | 
				
			||||||
 | 
					printf "    Press Enter to continue ...\n\n"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					read
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					if [[ -z "$repo" ]]; then
 | 
				
			||||||
 | 
					    printf "[+] No repo provided from the command line\n"
 | 
				
			||||||
 | 
					    printf "    Please select a number from the sample repos below or enter an URL:\n\n"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    is=0
 | 
				
			||||||
 | 
					    for r in "${repos[@]}"; do
 | 
				
			||||||
 | 
					        printf "    %2d) %s\n" $is "$r"
 | 
				
			||||||
 | 
					        is=$((is+1))
 | 
				
			||||||
 | 
					    done
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # ask for repo until index of sample repo is provided or an URL
 | 
				
			||||||
 | 
					    while [[ -z "$repo" ]]; do
 | 
				
			||||||
 | 
					        printf "\n    Or choose one from: https://huggingface.co/models?sort=trending&search=gguf\n\n"
 | 
				
			||||||
 | 
					        read -p "[+] Select repo: " repo
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        # check if the input is a number
 | 
				
			||||||
 | 
					        if [[ "$repo" =~ ^[0-9]+$ ]]; then
 | 
				
			||||||
 | 
					            if [[ "$repo" -ge 0 && "$repo" -lt ${#repos[@]} ]]; then
 | 
				
			||||||
 | 
					                repo="${repos[$repo]}"
 | 
				
			||||||
 | 
					            else
 | 
				
			||||||
 | 
					                printf "[-] Invalid repo index: %s\n" "$repo"
 | 
				
			||||||
 | 
					                repo=""
 | 
				
			||||||
 | 
					            fi
 | 
				
			||||||
 | 
					        elif [[ "$repo" =~ ^https?:// ]]; then
 | 
				
			||||||
 | 
					            repo="$repo"
 | 
				
			||||||
 | 
					        else
 | 
				
			||||||
 | 
					            printf "[-] Invalid repo URL: %s\n" "$repo"
 | 
				
			||||||
 | 
					            repo=""
 | 
				
			||||||
 | 
					        fi
 | 
				
			||||||
 | 
					    done
 | 
				
			||||||
 | 
					fi
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# remove suffix
 | 
				
			||||||
 | 
					repo=$(echo "$repo" | sed -E 's/\/tree\/main$//g')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					printf "[+] Checking for GGUF model files in %s\n" "$repo"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					model_tree="${repo%/}/tree/main"
 | 
				
			||||||
 | 
					model_files=$(curl -s "$model_tree" | grep -i "\\.gguf</span>" | sed -E 's/.*<span class="truncate group-hover:underline">(.*)<\/span><\/a>/\1/g')
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# list all files in the provided git repo
 | 
				
			||||||
 | 
					printf "[+] Model files:\n\n"
 | 
				
			||||||
 | 
					for file in $model_files; do
 | 
				
			||||||
 | 
					    # determine iw by grepping the filename with wtypes
 | 
				
			||||||
 | 
					    iw=-1
 | 
				
			||||||
 | 
					    is=0
 | 
				
			||||||
 | 
					    for wt in "${wtypes[@]}"; do
 | 
				
			||||||
 | 
					        # uppercase
 | 
				
			||||||
 | 
					        ufile=$(echo "$file" | tr '[:lower:]' '[:upper:]')
 | 
				
			||||||
 | 
					        if [[ "$ufile" =~ "$wt" ]]; then
 | 
				
			||||||
 | 
					            iw=$is
 | 
				
			||||||
 | 
					            break
 | 
				
			||||||
 | 
					        fi
 | 
				
			||||||
 | 
					        is=$((is+1))
 | 
				
			||||||
 | 
					    done
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    if [[ $iw -eq -1 ]]; then
 | 
				
			||||||
 | 
					        continue
 | 
				
			||||||
 | 
					    fi
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    wfiles[$iw]="$file"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    have=" "
 | 
				
			||||||
 | 
					    if [[ -f "$file" ]]; then
 | 
				
			||||||
 | 
					        have="*"
 | 
				
			||||||
 | 
					    fi
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    printf "    %2d) %s %s\n" $iw "$have" "$file"
 | 
				
			||||||
 | 
					done
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# ask for weights type until provided and available
 | 
				
			||||||
 | 
					while [[ -z "$wtype" ]]; do
 | 
				
			||||||
 | 
					    printf "\n"
 | 
				
			||||||
 | 
					    read -p "[+] Select weight type: " wtype
 | 
				
			||||||
 | 
					    wfile="${wfiles[$wtype]}"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    if [[ -z "$wfile" ]]; then
 | 
				
			||||||
 | 
					        printf "[-] Invalid weight type: %s\n" "$wtype"
 | 
				
			||||||
 | 
					        wtype=""
 | 
				
			||||||
 | 
					    fi
 | 
				
			||||||
 | 
					done
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					printf "[+] Selected weight type: %s (%s)\n" "$wtype" "$wfile"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					url="${repo%/}/resolve/main/$wfile"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# check file if the model has been downloaded before
 | 
				
			||||||
 | 
					chk="$wfile.chk"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# check if we should download the file
 | 
				
			||||||
 | 
					# - if $wfile does not exist
 | 
				
			||||||
 | 
					# - if $wfile exists but $chk does not exist
 | 
				
			||||||
 | 
					# - if $wfile exists and $chk exists but $wfile is newer than $chk
 | 
				
			||||||
 | 
					do_download=0
 | 
				
			||||||
 | 
					if [[ ! -f "$wfile" ]]; then
 | 
				
			||||||
 | 
					    do_download=1
 | 
				
			||||||
 | 
					elif [[ ! -f "$chk" ]]; then
 | 
				
			||||||
 | 
					    do_download=1
 | 
				
			||||||
 | 
					elif [[ "$wfile" -nt "$chk" ]]; then
 | 
				
			||||||
 | 
					    do_download=1
 | 
				
			||||||
 | 
					fi
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					if [[ $do_download -eq 1 ]]; then
 | 
				
			||||||
 | 
					    printf "[+] Downloading weights from %s\n" "$url"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # download the weights file
 | 
				
			||||||
 | 
					    curl -o "$wfile" -# -L "$url"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    # create a check file if successful
 | 
				
			||||||
 | 
					    if [[ $? -eq 0 ]]; then
 | 
				
			||||||
 | 
					        printf "[+] Creating check file %s\n" "$chk"
 | 
				
			||||||
 | 
					        touch "$chk"
 | 
				
			||||||
 | 
					    fi
 | 
				
			||||||
 | 
					else
 | 
				
			||||||
 | 
					    printf "[+] Using cached weights %s\n" "$wfile"
 | 
				
			||||||
 | 
					fi
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# get latest llama.cpp and build
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					printf "[+] Downloading latest llama.cpp\n"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					llama_cpp_dir="__llama_cpp_port_${port}__"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					if [[ -d "$llama_cpp_dir" && ! -f "$llama_cpp_dir/__ggml_script__" ]]; then
 | 
				
			||||||
 | 
					    # if the dir exists and there isn't a file "__ggml_script__" in it, abort
 | 
				
			||||||
 | 
					    printf "[-] Directory %s already exists\n" "$llama_cpp_dir"
 | 
				
			||||||
 | 
					    printf "[-] Please remove it and try again\n"
 | 
				
			||||||
 | 
					    exit 1
 | 
				
			||||||
 | 
					elif [[ -d "$llama_cpp_dir" ]]; then
 | 
				
			||||||
 | 
					    printf "[+] Directory %s already exists\n" "$llama_cpp_dir"
 | 
				
			||||||
 | 
					    printf "[+] Using cached llama.cpp\n"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    cd "$llama_cpp_dir"
 | 
				
			||||||
 | 
					    git reset --hard
 | 
				
			||||||
 | 
					    git fetch
 | 
				
			||||||
 | 
					    git checkout origin/master
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    cd ..
 | 
				
			||||||
 | 
					else
 | 
				
			||||||
 | 
					    git clone https://github.com/ggerganov/llama.cpp "$llama_cpp_dir"
 | 
				
			||||||
 | 
					fi
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# mark that that the directory is made by this script
 | 
				
			||||||
 | 
					touch "$llama_cpp_dir/__ggml_script__"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					if [[ $verbose -eq 1 ]]; then
 | 
				
			||||||
 | 
					    set -x
 | 
				
			||||||
 | 
					fi
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# build
 | 
				
			||||||
 | 
					cd "$llama_cpp_dir"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					make clean
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					log="--silent"
 | 
				
			||||||
 | 
					if [[ $verbose -eq 1 ]]; then
 | 
				
			||||||
 | 
					    log=""
 | 
				
			||||||
 | 
					fi
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					if [[ "$backend" == "cuda" ]]; then
 | 
				
			||||||
 | 
					    printf "[+] Building with CUDA backend\n"
 | 
				
			||||||
 | 
					    LLAMA_CUBLAS=1 make -j server $log
 | 
				
			||||||
 | 
					elif [[ "$backend" == "cpu" ]]; then
 | 
				
			||||||
 | 
					    printf "[+] Building with CPU backend\n"
 | 
				
			||||||
 | 
					    make -j server $log
 | 
				
			||||||
 | 
					elif [[ "$backend" == "metal" ]]; then
 | 
				
			||||||
 | 
					    printf "[+] Building with Metal backend\n"
 | 
				
			||||||
 | 
					    make -j server $log
 | 
				
			||||||
 | 
					elif [[ "$backend" == "opencl" ]]; then
 | 
				
			||||||
 | 
					    printf "[+] Building with OpenCL backend\n"
 | 
				
			||||||
 | 
					    LLAMA_CLBLAST=1 make -j server $log
 | 
				
			||||||
 | 
					else
 | 
				
			||||||
 | 
					    printf "[-] Unknown backend: %s\n" "$backend"
 | 
				
			||||||
 | 
					    exit 1
 | 
				
			||||||
 | 
					fi
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					# run the server
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					printf "[+] Running server\n"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					args=""
 | 
				
			||||||
 | 
					if [[ "$backend" == "cuda" ]]; then
 | 
				
			||||||
 | 
					    export CUDA_VISIBLE_DEVICES=$gpu_id
 | 
				
			||||||
 | 
					    args="-ngl 999"
 | 
				
			||||||
 | 
					elif [[ "$backend" == "cpu" ]]; then
 | 
				
			||||||
 | 
					    args="-ngl 0"
 | 
				
			||||||
 | 
					elif [[ "$backend" == "metal" ]]; then
 | 
				
			||||||
 | 
					    args="-ngl 999"
 | 
				
			||||||
 | 
					elif [[ "$backend" == "opencl" ]]; then
 | 
				
			||||||
 | 
					    args="-ngl 999"
 | 
				
			||||||
 | 
					else
 | 
				
			||||||
 | 
					    printf "[-] Unknown backend: %s\n" "$backend"
 | 
				
			||||||
 | 
					    exit 1
 | 
				
			||||||
 | 
					fi
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					if [[ $verbose -eq 1 ]]; then
 | 
				
			||||||
 | 
					    args="$args --verbose"
 | 
				
			||||||
 | 
					fi
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					./server -m "../$wfile" --host 0.0.0.0 --port "$port" -c $n_kv -np "$n_parallel" $args
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					exit 0
 | 
				
			||||||
		Reference in New Issue
	
	Block a user