	docker : add support for CUDA in docker (#1461)
Co-authored-by: canardleteer <eris.has.a.dad+github@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

.devops/full-cuda.Dockerfile (new file, 33 lines)
@@ -0,0 +1,33 @@
ARG UBUNTU_VERSION=22.04

# This needs to generally match the container host's environment.
ARG CUDA_VERSION=11.7.1

# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}

FROM ${BASE_CUDA_DEV_CONTAINER} as build

# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all

RUN apt-get update && \
    apt-get install -y build-essential python3 python3-pip

COPY requirements.txt requirements.txt

RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt

WORKDIR /app

COPY . .

# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable cuBLAS
ENV LLAMA_CUBLAS=1

RUN make

ENTRYPOINT ["/app/.devops/tools.sh"]

.devops/main-cuda.Dockerfile (new file, 32 lines)
@@ -0,0 +1,32 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=11.7.1
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
# Target the CUDA runtime image
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}

FROM ${BASE_CUDA_DEV_CONTAINER} as build

# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all

RUN apt-get update && \
    apt-get install -y build-essential

WORKDIR /app

COPY . .

# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable cuBLAS
ENV LLAMA_CUBLAS=1

RUN make

FROM ${BASE_CUDA_RUN_CONTAINER} as runtime

COPY --from=build /app/main /main

ENTRYPOINT [ "/main" ]

Makefile (8 changed lines)
@@ -163,7 +163,12 @@ ifdef LLAMA_CUBLAS
 	LDFLAGS   += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
 	OBJS      += ggml-cuda.o
 	NVCC      = nvcc
-	NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native
+	NVCCFLAGS = --forward-unknown-to-host-compiler
+ifdef CUDA_DOCKER_ARCH
+	NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
+else
+	NVCCFLAGS += -arch=native
+endif # CUDA_DOCKER_ARCH
 ifdef LLAMA_CUDA_FORCE_DMMV
 	NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
 endif # LLAMA_CUDA_FORCE_DMMV
@@ -187,6 +192,7 @@ ifdef LLAMA_CUDA_KQUANTS_ITER
 else
 	NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
 endif
+
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
 	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
 endif # LLAMA_CUBLAS
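
Outside of Docker, the same variable can be used to pin the nvcc target architecture instead of relying on `-arch=native`. A minimal sketch, assuming a sufficiently recent nvcc is on the PATH (>= 11.5 for `-arch=all`); the single-architecture value is only illustrative:

```bash
# Fat build covering all supported GPU architectures (what the CUDA Dockerfiles default to)
CUDA_DOCKER_ARCH=all make LLAMA_CUBLAS=1

# Or pin a single architecture, e.g. an RTX 30-series (Ampere) card
CUDA_DOCKER_ARCH=sm_86 make LLAMA_CUBLAS=1
```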
							
								
								
									
README.md (+32 lines)
@@ -731,6 +731,38 @@ or with a light image:
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
```

### Docker With CUDA

Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) properly installed on Linux, or is using a GPU-enabled cloud, `cuBLAS` should be accessible inside the container.
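
As a quick sanity check that the container runtime can see the GPU (the CUDA image tag below is only an example), the following should print the host's `nvidia-smi` table:

```bash
docker run --rm --gpus all nvidia/cuda:11.7.1-base-ubuntu22.04 nvidia-smi
```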

#### Building Locally

```bash
docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile .
docker build -t local/llama.cpp:light-cuda -f .devops/main-cuda.Dockerfile .
```

You may want to pass different build `ARG` values, depending on the CUDA environment supported by your container host and on your GPU architecture; see the example after the defaults below.

The defaults are:

- `CUDA_VERSION` set to `11.7.1`
- `CUDA_DOCKER_ARCH` set to `all`
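
For instance, to build the light image against a different CUDA release and a single GPU architecture (the values below are illustrative and must match your host driver and card):

```bash
docker build -t local/llama.cpp:light-cuda \
  --build-arg CUDA_VERSION=11.8.0 \
  --build-arg CUDA_DOCKER_ARCH=sm_86 \
  -f .devops/main-cuda.Dockerfile .
```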

The resulting images are essentially the same as the non-CUDA images:

1. `local/llama.cpp:full-cuda`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and quantize them to 4-bit.
2. `local/llama.cpp:light-cuda`: This image only includes the main executable file.

#### Usage

After building locally, usage is similar to the non-CUDA examples, but you'll need to add the `--gpus` flag. You will also want to use the `--n-gpu-layers` flag to offload model layers to the GPU.

```bash
docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
```

### Contributing

- Contributors can open PRs