Mirror of https://github.com/ggml-org/llama.cpp.git

	server: update refs -> llama-server
gitignore llama-server
@@ -25,13 +25,13 @@ ENV LLAMA_CUDA=1
 # Enable cURL
 ENV LLAMA_CURL=1
 
-RUN make -j$(nproc) server
+RUN make -j$(nproc) llama-server
 
 FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
 
 RUN apt-get update && \
     apt-get install -y libcurl4-openssl-dev libgomp1
 
-COPY --from=build /app/server /server
+COPY --from=build /app/llama-server /llama-server
 
-ENTRYPOINT [ "/server" ]
+ENTRYPOINT [ "/llama-server" ]

@@ -38,8 +38,8 @@ RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRO
 RUN apt-get update && \
     apt-get install -y libcurl4-openssl-dev
 
-COPY --from=build /app/build/bin/server /server
+COPY --from=build /app/build/bin/llama-server /llama-server
 
 ENV LC_ALL=C.utf8
 
-ENTRYPOINT [ "/server" ]
+ENTRYPOINT [ "/llama-server" ]

@@ -45,6 +45,6 @@ ENV LLAMA_CURL=1
 RUN apt-get update && \
     apt-get install -y libcurl4-openssl-dev
 
-RUN make -j$(nproc)
+RUN make -j$(nproc) llama-server
 
-ENTRYPOINT [ "/app/server" ]
+ENTRYPOINT [ "/app/llama-server" ]

@@ -23,9 +23,9 @@ RUN cmake -B build -DLLAMA_VULKAN=1 -DLLAMA_CURL=1 && \
 
 # Clean up
 WORKDIR /
-RUN cp /app/build/bin/server /server && \
+RUN cp /app/build/bin/llama-server /llama-server && \
     rm -rf /app
 
 ENV LC_ALL=C.utf8
 
-ENTRYPOINT [ "/server" ]
+ENTRYPOINT [ "/llama-server" ]

@@ -11,15 +11,15 @@ COPY . .
 
 ENV LLAMA_CURL=1
 
-RUN make -j$(nproc) server
+RUN make -j$(nproc) llama-server
 
 FROM ubuntu:$UBUNTU_VERSION as runtime
 
 RUN apt-get update && \
     apt-get install -y libcurl4-openssl-dev libgomp1
 
-COPY --from=build /app/server /server
+COPY --from=build /app/llama-server /llama-server
 
 ENV LC_ALL=C.utf8
 
-ENTRYPOINT [ "/server" ]
+ENTRYPOINT [ "/llama-server" ]

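Since the runtime images above now ship the binary as `/llama-server` and use it as the entrypoint, anything after the image name in `docker run` is forwarded directly to the server. A minimal sketch, assuming a locally built image; the tag, Dockerfile path, and model path are placeholders:

```bash
# Hypothetical tag; build from whichever server Dockerfile you use.
docker build -t llamacpp-server -f <server-dockerfile> .

# Arguments after the image name go straight to the /llama-server entrypoint.
docker run -p 8080:8080 -v "$PWD/models:/models" llamacpp-server \
    -m /models/7B/ggml-model.gguf -c 2048 --host 0.0.0.0 --port 8080
```
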
@@ -26,7 +26,7 @@ elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
         fi
     done
 elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
-    ./server "$@"
+    ./llama-server "$@"
 else
     echo "Unknown command: $arg1"
     echo "Available commands: "

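The dispatcher above forwards the `--server`/`-s` case to `./llama-server`. If this script is exposed as an image entrypoint (as in an all-in-one image), a run might look like the sketch below; the image tag is an assumption:

```bash
# Hypothetical tag for an image whose entrypoint is the dispatcher script.
docker run -p 8080:8080 -v "$PWD/models:/models" llamacpp-full \
    --server -m /models/7B/ggml-model.gguf --host 0.0.0.0 --port 8080
```
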
							
								
								
									
.gitignore (vendored)
@@ -76,7 +76,7 @@ models-mnt
 /quantize-stats
 /result
 /save-load-state
-/server
+/llama-server
 /simple
 /batched
 /batched-bench

@@ -1,5 +1,5 @@
 # Usage:
-#! ./server -m some-model.gguf &
+#! ./llama-server -m some-model.gguf &
 #! pip install pydantic
 #! python json-schema-pydantic-example.py
 

@@ -16,7 +16,7 @@ GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 4096 --batch-size 1024}"
 
 
 # shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
-./server $GEN_OPTIONS \
+./llama-server $GEN_OPTIONS \
   --model "$MODEL" \
   --threads "$N_THREAD" \
   --rope-freq-scale 1.0 \

@@ -80,26 +80,26 @@ The project is under active development, and we are [looking for feedback and co
 
 ## Build
 
-`server` is built alongside everything else from the root of the project
+`llama-server` is built alongside everything else from the root of the project
 
 - Using `make`:
 
   ```bash
-  make server
+  make llama-server
   ```
 
 - Using `CMake`:
 
   ```bash
   cmake -B build
-  cmake --build build --config Release -t server
+  cmake --build build --config Release -t llama-server
   ```
 
-  Binary is at `./build/bin/server`
+  Binary is at `./build/bin/llama-server`
 
 ## Build with SSL
 
-`server` can also be built with SSL support using OpenSSL 3
+`llama-server` can also be built with SSL support using OpenSSL 3
 
 - Using `make`:
 
@@ -107,14 +107,14 @@ The project is under active development, and we are [looking for feedback and co
   # NOTE: For non-system openssl, use the following:
   #   CXXFLAGS="-I /path/to/openssl/include"
   #   LDFLAGS="-L /path/to/openssl/lib"
-  make LLAMA_SERVER_SSL=true server
+  make LLAMA_SERVER_SSL=true llama-server
   ```
 
 - Using `CMake`:
 
   ```bash
   cmake -B build -DLLAMA_SERVER_SSL=ON
-  cmake --build build --config Release -t server
+  cmake --build build --config Release -t llama-server
   ```
 
 ## Quick Start
@@ -124,13 +124,13 @@ To get started right away, run the following command, making sure to use the cor
 ### Unix-based systems (Linux, macOS, etc.)
 
 ```bash
-./server -m models/7B/ggml-model.gguf -c 2048
+./llama-server -m models/7B/ggml-model.gguf -c 2048
 ```
 
 ### Windows
 
 ```powershell
-server.exe -m models\7B\ggml-model.gguf -c 2048
+llama-server.exe -m models\7B\ggml-model.gguf -c 2048
 ```
 
 The above command will start a server that by default listens on `127.0.0.1:8080`.
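
Once the renamed binary is running with the Quick Start command above, it can be exercised from another shell against the default `127.0.0.1:8080` address. A minimal sketch using the server's `/health` and `/completion` endpoints; the prompt and token count are just examples:

```bash
# Liveness check.
curl http://127.0.0.1:8080/health

# Simple completion request; adjust prompt and n_predict as needed.
curl --request POST http://127.0.0.1:8080/completion \
    --header "Content-Type: application/json" \
    --data '{"prompt": "Building a website can be done in 10 simple steps:", "n_predict": 64}'
```
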
@@ -629,11 +629,11 @@ bash chat.sh
 
 ### OAI-like API
 
-The HTTP `server` supports an OAI-like API: https://github.com/openai/openai-openapi
+The HTTP `llama-server` supports an OAI-like API: https://github.com/openai/openai-openapi
 
 ### API errors
 
-`server` returns errors in the same format as OAI: https://github.com/openai/openai-openapi
+`llama-server` returns errors in the same format as OAI: https://github.com/openai/openai-openapi
 
 Example of an error:
 

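As a concrete illustration of the OAI-like API referenced above, a chat request can be sent to the server's OpenAI-compatible endpoint; the model name and messages below are placeholders:

```bash
curl http://127.0.0.1:8080/v1/chat/completions \
    --header "Content-Type: application/json" \
    --header "Authorization: Bearer no-key" \
    --data '{
        "model": "gpt-3.5-turbo",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Hello!"}
        ]
    }'
```
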
@@ -99,7 +99,7 @@ The `bench.py` script does several steps:
 It aims to be used in the CI, but you can run it manually:
 
 ```shell
-LLAMA_SERVER_BIN_PATH=../../../cmake-build-release/bin/server python bench.py \
+LLAMA_SERVER_BIN_PATH=../../../cmake-build-release/bin/llama-server python bench.py \
               --runner-label local \
               --name local \
               --branch `git rev-parse --abbrev-ref HEAD` \

@@ -245,7 +245,7 @@ def start_server(args):
 
 def start_server_background(args):
     # Start the server
-    server_path = '../../../build/bin/server'
+    server_path = '../../../build/bin/llama-server'
     if 'LLAMA_SERVER_BIN_PATH' in os.environ:
         server_path = os.environ['LLAMA_SERVER_BIN_PATH']
     server_args = [

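Since `start_server_background` only falls back to the relative `../../../build/bin/llama-server` path when `LLAMA_SERVER_BIN_PATH` is unset, pointing that variable at any build of the renamed binary is enough. A minimal sketch; the absolute path is a placeholder and the remaining `bench.py` flags follow the shell snippet shown earlier:

```bash
# Placeholder path; any llama-server build works.
export LLAMA_SERVER_BIN_PATH=/path/to/llama.cpp/build/bin/llama-server

# Remaining flags as in the earlier shell snippet.
python bench.py --runner-label local --name local \
    --branch "$(git rev-parse --abbrev-ref HEAD)"
```
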
@@ -44,12 +44,12 @@ http module.
 
 ### running using examples/server
 
-bin/server -m path/model.gguf --path ../examples/server/public_simplechat [--port PORT]
+./llama-server -m path/model.gguf --path examples/server/public_simplechat [--port PORT]
 
 ### running using python3's server module
 
 first run examples/server
-* bin/server -m path/model.gguf
+* ./llama-server -m path/model.gguf
 
 next run this web front end in examples/server/public_simplechat
 * cd ../examples/server/public_simplechat

@@ -40,7 +40,7 @@ It's possible to override some scenario steps values with environment variables:
 | variable                 | description                                                                                      |
 |--------------------------|--------------------------------------------------------------------------------------------------|
 | `PORT`                   | `context.server_port` to set the listening port of the server during scenario, default: `8080`  |
-| `LLAMA_SERVER_BIN_PATH`  | to change the server binary path, default: `../../../build/bin/server`                          |
+| `LLAMA_SERVER_BIN_PATH`  | to change the server binary path, default: `../../../build/bin/llama-server`                    |
 | `DEBUG`                  | "ON" to enable steps and server verbose mode `--verbose`                                         |
 | `SERVER_LOG_FORMAT_JSON` | if set switch server logs to json format                                                         |
 | `N_GPU_LAYERS`           | number of model layers to offload to VRAM `-ngl --n-gpu-layers`                                  |

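These variables are read by the test harness at startup, so after the rename only `LLAMA_SERVER_BIN_PATH` (or the default relative path) needs to point at `llama-server`. A sketch, assuming the suite is launched with `behave` from `examples/server/tests`; the runner invocation and port are assumptions:

```bash
cd examples/server/tests
# Override the binary path and port, and enable verbose step/server output.
LLAMA_SERVER_BIN_PATH=../../../build/bin/llama-server PORT=8081 DEBUG=ON behave
```
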
@@ -1272,9 +1272,9 @@ def context_text(context):
 
 def start_server_background(context):
     if os.name == 'nt':
-        context.server_path = '../../../build/bin/Release/server.exe'
+        context.server_path = '../../../build/bin/Release/llama-server.exe'
     else:
-        context.server_path = '../../../build/bin/server'
+        context.server_path = '../../../build/bin/llama-server'
     if 'LLAMA_SERVER_BIN_PATH' in os.environ:
         context.server_path = os.environ['LLAMA_SERVER_BIN_PATH']
     server_listen_addr = context.server_fqdn

@@ -1,6 +1,6 @@
 # GBNF Guide
 
-GBNF (GGML BNF) is a format for defining [formal grammars](https://en.wikipedia.org/wiki/Formal_grammar) to constrain model outputs in `llama.cpp`. For example, you can use it to force the model to generate valid JSON, or speak only in emojis. GBNF grammars are supported in various ways in `examples/main` and `examples/server`.
+GBNF (GGML BNF) is a format for defining [formal grammars](https://en.wikipedia.org/wiki/Formal_grammar) to constrain model outputs in `llama.cpp`. For example, you can use it to force the model to generate valid JSON, or speak only in emojis. GBNF grammars are supported in various ways in `examples/main` and `examples/llama-server`.
 
 ## Background
 

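For `llama-server`, one way to apply a GBNF grammar is to supply it per request. The sketch below assumes the `/completion` endpoint accepts a `grammar` field in the request body and uses a toy grammar that restricts output to "yes" or "no":

```bash
curl --request POST http://127.0.0.1:8080/completion \
    --header "Content-Type: application/json" \
    --data '{
        "prompt": "Is the sky blue? Answer:",
        "n_predict": 4,
        "grammar": "root ::= \"yes\" | \"no\""
    }'
```
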
Author: Olivier Chafik