	docker : add support for CUDA in docker (#1461)
Co-authored-by: canardleteer <eris.has.a.dad+github@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
.devops/full-cuda.Dockerfile (new file, 33 lines)
@@ -0,0 +1,33 @@
ARG UBUNTU_VERSION=22.04

# This needs to generally match the container host's environment.
ARG CUDA_VERSION=11.7.1

# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}

FROM ${BASE_CUDA_DEV_CONTAINER} as build

# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all

RUN apt-get update && \
    apt-get install -y build-essential python3 python3-pip

COPY requirements.txt requirements.txt

RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt

WORKDIR /app

COPY . .

# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable cuBLAS
ENV LLAMA_CUBLAS=1

RUN make

ENTRYPOINT ["/app/.devops/tools.sh"]
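The entrypoint dispatches to `.devops/tools.sh`, the same helper the non-CUDA full image uses; per the README addition below, its `--run` mode forwards arguments to `main`. A hedged usage sketch (the tag and model path are illustrative):

```bash
# Build the full image, then run inference through the tools.sh --run mode.
docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile .
docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda \
  --run -m /models/7B/ggml-model-q4_0.bin -p "Hello" -n 64 --n-gpu-layers 1
```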
.devops/main-cuda.Dockerfile (new file, 32 lines)
@@ -0,0 +1,32 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=11.7.1
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
# Target the CUDA runtime image
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}

FROM ${BASE_CUDA_DEV_CONTAINER} as build

# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all

RUN apt-get update && \
    apt-get install -y build-essential

WORKDIR /app

COPY . .

# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable cuBLAS
ENV LLAMA_CUBLAS=1

RUN make

FROM ${BASE_CUDA_RUN_CONTAINER} as runtime

COPY --from=build /app/main /main

ENTRYPOINT [ "/main" ]
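Because this is a two-stage build, only the compiled `main` binary is copied out of the `-devel` build stage into the much smaller `-runtime` CUDA base. A quick sanity-check sketch (the tag is illustrative):

```bash
# Build the light image, then inspect its size; it should be far smaller
# than the devel base image used for compilation.
docker build -t local/llama.cpp:light-cuda -f .devops/main-cuda.Dockerfile .
docker images local/llama.cpp:light-cuda
```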
Makefile (8 lines changed)
@@ -163,7 +163,12 @@ ifdef LLAMA_CUBLAS
 	LDFLAGS   += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
 	OBJS      += ggml-cuda.o
 	NVCC      = nvcc
-	NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native
+	NVCCFLAGS = --forward-unknown-to-host-compiler
+ifdef CUDA_DOCKER_ARCH
+	NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
+else
+	NVCCFLAGS += -arch=native
+endif # CUDA_DOCKER_ARCH
 ifdef LLAMA_CUDA_FORCE_DMMV
 	NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
 endif # LLAMA_CUDA_FORCE_DMMV
@@ -187,6 +192,7 @@ ifdef LLAMA_CUDA_KQUANTS_ITER
 else
 	NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
 endif
+
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
 	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
 endif # LLAMA_CUBLAS
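Since GNU make's `ifdef` also sees environment variables, the same switch works outside Docker. A minimal sketch (the `sm_80` value is illustrative; pick your GPU's architecture):

```bash
# nvcc receives -arch=sm_80 instead of -arch=native when the variable is set.
CUDA_DOCKER_ARCH=sm_80 make LLAMA_CUBLAS=1
```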
README.md (32 lines added)
@@ -731,6 +731,38 @@ or with a light image:

```bash
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
```
### Docker With CUDA

Assuming the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) is properly installed on Linux, or you are using a GPU-enabled cloud instance, `cuBLAS` should be accessible inside the container.
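Before building, it may be worth confirming the toolkit is wired up; a minimal sketch, assuming the stock `nvidia/cuda` base image tag below is available:

```bash
# nvidia-smi should list your GPU from inside a plain CUDA container.
docker run --rm --gpus all nvidia/cuda:11.7.1-base-ubuntu22.04 nvidia-smi
```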
#### Building Locally

```bash
docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile .
docker build -t local/llama.cpp:light-cuda -f .devops/main-cuda.Dockerfile .
```
You may want to pass different build arguments, depending on the CUDA environment supported by your container host and on the target GPU architecture; an example override follows the list below.

The defaults are:

- `CUDA_VERSION` set to `11.7.1`
- `CUDA_DOCKER_ARCH` set to `all`
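A hedged sketch of overriding both defaults at build time (the version and architecture values are illustrative and should match your host driver and GPU):

```bash
docker build -t local/llama.cpp:light-cuda \
  --build-arg CUDA_VERSION=11.8.0 \
  --build-arg CUDA_DOCKER_ARCH=sm_86 \
  -f .devops/main-cuda.Dockerfile .
```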
The resulting images are essentially the same as the non-CUDA images:

1. `local/llama.cpp:full-cuda`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and quantize them to 4 bits.
2. `local/llama.cpp:light-cuda`: This image includes only the main executable file.
#### Usage

After building locally, usage is similar to the non-CUDA examples, but you'll need to add the `--gpus` flag. You will also want to use the `--n-gpu-layers` flag to offload layers to the GPU.

```bash
docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
```
### Contributing

- Contributors can open PRs