Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-30 08:42:00 +00:00)
	docker : add server-first container images (#5157)
* feat: add Dockerfiles for each platform that use ./server instead of ./main
* feat: update .github/workflows/docker.yml to build server-first docker containers
* doc: add information about running the server with Docker to README.md
* doc: add information about running with docker to the server README
* doc: update n-gpu-layers to show correct GPU usage
* fix(doc): update container tag from `server` to `server-cuda` for README example on running server container with CUDA
`.devops/server-cuda.Dockerfile` (new file, 32 lines):

```dockerfile
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=11.7.1
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
# Target the CUDA runtime image
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}

FROM ${BASE_CUDA_DEV_CONTAINER} as build

# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all

RUN apt-get update && \
    apt-get install -y build-essential git

WORKDIR /app

COPY . .

# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable cuBLAS
ENV LLAMA_CUBLAS=1

RUN make

FROM ${BASE_CUDA_RUN_CONTAINER} as runtime

COPY --from=build /app/server /server

ENTRYPOINT [ "/server" ]
```
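A minimal usage sketch for this image, assuming a local build: the `local/llama.cpp:server-cuda` tag, model path, and port are placeholders, and `--gpus all` requires the NVIDIA Container Toolkit on the host (as the README changes below also note).

```bash
# Build the CUDA server image from the repository root (tag is an example).
docker build -t local/llama.cpp:server-cuda -f .devops/server-cuda.Dockerfile .

# Run the server and expose it on port 8000; adjust the model path and
# --n-gpu-layers to match your model and available GPU memory.
docker run --gpus all -v /path/to/models:/models -p 8000:8000 \
    local/llama.cpp:server-cuda \
    -m /models/7B/ggml-model-q4_0.gguf --host 0.0.0.0 --port 8000 --n-gpu-layers 99
```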
`.devops/server-intel.Dockerfile` (new file, 25 lines):

```dockerfile
ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
ARG UBUNTU_VERSION=22.04

FROM intel/hpckit:$ONEAPI_VERSION as build

RUN apt-get update && \
    apt-get install -y git

WORKDIR /app

COPY . .

# For some reason, "-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DLLAMA_NATIVE=ON" gives worse performance
RUN mkdir build && \
    cd build && \
    cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx && \
    cmake --build . --config Release --target main server

FROM ubuntu:$UBUNTU_VERSION as runtime

COPY --from=build /app/build/bin/server /server

ENV LC_ALL=C.utf8

ENTRYPOINT [ "/server" ]
```
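As with the CUDA image above, a hedged build-and-run sketch; the `local/llama.cpp:server-intel` tag, model path, and port are placeholders rather than anything this commit publishes.

```bash
# Build the oneAPI-compiled server image (tag is an example).
docker build -t local/llama.cpp:server-intel -f .devops/server-intel.Dockerfile .

# Run it like the plain CPU server image; model path and port are placeholders.
docker run -v /path/to/models:/models -p 8000:8000 \
    local/llama.cpp:server-intel \
    -m /models/7B/ggml-model-q4_0.gguf --host 0.0.0.0 --port 8000
```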
`.devops/server-rocm.Dockerfile` (new file, 45 lines):

```dockerfile
ARG UBUNTU_VERSION=22.04

# This needs to generally match the container host's environment.
ARG ROCM_VERSION=5.6

# Target the ROCm build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete

FROM ${BASE_ROCM_DEV_CONTAINER} as build

# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
ARG ROCM_DOCKER_ARCH=\
    gfx803 \
    gfx900 \
    gfx906 \
    gfx908 \
    gfx90a \
    gfx1010 \
    gfx1030 \
    gfx1100 \
    gfx1101 \
    gfx1102

COPY requirements.txt   requirements.txt
COPY requirements       requirements

RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt

WORKDIR /app

COPY . .

# Set the GPU targets for the ROCm build
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
ENV LLAMA_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++

RUN make

ENTRYPOINT [ "/app/server" ]
```
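A hedged run sketch for the ROCm image: ROCm containers generally need the host's `/dev/kfd` and `/dev/dri` devices passed through, which this commit does not document, so the device flags below, along with the tag, model path, and port, are assumptions.

```bash
# Build the ROCm server image (tag is an example).
docker build -t local/llama.cpp:server-rocm -f .devops/server-rocm.Dockerfile .

# Run it with the ROCm devices passed through; --device/--group-add are the
# usual ROCm-in-Docker requirements, not something this Dockerfile sets up.
docker run --device /dev/kfd --device /dev/dri --group-add video \
    -v /path/to/models:/models -p 8000:8000 \
    local/llama.cpp:server-rocm \
    -m /models/7B/ggml-model-q4_0.gguf --host 0.0.0.0 --port 8000 --n-gpu-layers 99
```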
`.devops/server.Dockerfile` (new file, 20 lines):

```dockerfile
ARG UBUNTU_VERSION=22.04

FROM ubuntu:$UBUNTU_VERSION as build

RUN apt-get update && \
    apt-get install -y build-essential git

WORKDIR /app

COPY . .

RUN make

FROM ubuntu:$UBUNTU_VERSION as runtime

COPY --from=build /app/server /server

ENV LC_ALL=C.utf8

ENTRYPOINT [ "/server" ]
```
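The CPU-only image is the simplest to try locally; a sketch, with a placeholder tag, model path, and port:

```bash
# Build the CPU-only server image (tag is an example).
docker build -t local/llama.cpp:server -f .devops/server.Dockerfile .

# Run the server and expose it on port 8000; the model path is a placeholder.
docker run -v /path/to/models:/models -p 8000:8000 \
    local/llama.cpp:server \
    -m /models/7B/ggml-model-q4_0.gguf --host 0.0.0.0 --port 8000
```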
`.github/workflows/docker.yml` (vendored, 4 additions):

```diff
@@ -28,14 +28,18 @@ jobs:
         config:
           - { tag: "light", dockerfile: ".devops/main.Dockerfile", platforms: "linux/amd64,linux/arm64" }
           - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          - { tag: "server", dockerfile: ".devops/server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
           # NOTE(canardletter): The CUDA builds on arm64 are very slow, so I
           #                     have disabled them for now until the reason why
           #                     is understood.
           - { tag: "light-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platforms: "linux/amd64" }
           - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "server-cuda", dockerfile: ".devops/server-cuda.Dockerfile", platforms: "linux/amd64" }
           - { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
           - { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+          - { tag: "server-rocm", dockerfile: ".devops/server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
           - { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" }
     steps:
       - name: Check out the repo
         uses: actions/checkout@v3
```
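Each new matrix entry corresponds roughly to a multi-platform `docker buildx` invocation like the one below; this is only a sketch, the tag and registry are placeholders, and the real workflow additionally handles registry login, tagging, and pushing.

```bash
# Rough local equivalent of the new "server" matrix entry (no login/push here;
# the registry and tag are placeholders).
docker buildx build \
    --platform linux/amd64,linux/arm64 \
    -f .devops/server.Dockerfile \
    -t ghcr.io/ggerganov/llama.cpp:server \
    .
```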
`README.md` (14 changes):

````diff
@@ -931,17 +931,20 @@ Place your desired model into the `~/llama.cpp/models/` directory and execute th
 * Create a folder to store big models & intermediate files (ex. /llama/models)
 
 #### Images
-We have two Docker images available for this project:
+We have three Docker images available for this project:
 
 1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`)
 2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`)
+3. `ghcr.io/ggerganov/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`)
 
 Additionally, there the following images, similar to the above:
 
 - `ghcr.io/ggerganov/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`)
 - `ghcr.io/ggerganov/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`)
+- `ghcr.io/ggerganov/llama.cpp:server-cuda`: Same as `server` but compiled with CUDA support. (platforms: `linux/amd64`)
 - `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
 - `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
+- `ghcr.io/ggerganov/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
 
 The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library, you'll need to build the images locally for now).
 
@@ -967,6 +970,12 @@ or with a light image:
 docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
 ```
 
+or with a server image:
+
+```bash
+docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggerganov/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512
+```
+
 ### Docker With CUDA
 
 Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) properly installed on Linux, or is using a GPU enabled cloud, `cuBLAS` should be accessible inside the container.
@@ -976,6 +985,7 @@ Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia
 ```bash
 docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile .
 docker build -t local/llama.cpp:light-cuda -f .devops/main-cuda.Dockerfile .
+docker build -t local/llama.cpp:server-cuda -f .devops/server-cuda.Dockerfile .
 ```
 
 You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture.
@@ -989,6 +999,7 @@ The resulting images, are essentially the same as the non-CUDA images:
 
 1. `local/llama.cpp:full-cuda`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
 2. `local/llama.cpp:light-cuda`: This image only includes the main executable file.
+3. `local/llama.cpp:server-cuda`: This image only includes the server executable file.
 
 #### Usage
 
@@ -997,6 +1008,7 @@ After building locally, Usage is similar to the non-CUDA examples, but you'll ne
 ```bash
 docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
 docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
+docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
 ```
 
 ### Contributing
````
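The README note above about passing different `ARGS` maps onto Docker's `--build-arg` flag; a hedged sketch, where the CUDA version and architecture values are purely illustrative and must match your container host and GPU.

```bash
# Illustrative only: pin a different CUDA version and narrow the build to a
# single GPU architecture instead of the default fat build. Check that a
# matching nvidia/cuda base image exists before using these values.
docker build -t local/llama.cpp:server-cuda \
    --build-arg CUDA_VERSION=12.2.0 \
    --build-arg CUDA_DOCKER_ARCH=sm_86 \
    -f .devops/server-cuda.Dockerfile .
```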
| @@ -66,6 +66,14 @@ server.exe -m models\7B\ggml-model.gguf -c 2048 | |||||||
| The above command will start a server that by default listens on `127.0.0.1:8080`. | The above command will start a server that by default listens on `127.0.0.1:8080`. | ||||||
| You can consume the endpoints with Postman or NodeJS with axios library. You can visit the web front end at the same url. | You can consume the endpoints with Postman or NodeJS with axios library. You can visit the web front end at the same url. | ||||||
|  |  | ||||||
|  | ### Docker: | ||||||
|  | ```bash | ||||||
|  | docker run -p 8080:8080 -v /path/to/models:/models ggerganov/llama.cpp:server -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080 | ||||||
|  |  | ||||||
|  | # or, with CUDA: | ||||||
|  | docker run -p 8080:8080 -v /path/to/models:/models --gpus all ggerganov/llama.cpp:server-cuda -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080 --n-gpu-layers 99 | ||||||
|  | ``` | ||||||
|  |  | ||||||
| ## Testing with CURL | ## Testing with CURL | ||||||
|  |  | ||||||
| Using [curl](https://curl.se/). On Windows `curl.exe` should be available in the base OS. | Using [curl](https://curl.se/). On Windows `curl.exe` should be available in the base OS. | ||||||
|   | |||||||
Author: Kyle Mistele