Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-31 08:51:55 +00:00)
	docker : add build for SYCL, Vulkan + update readme (#5228)
* add vulkan dockerfile
* intel dockerfile: compile sycl by default
* fix vulkan dockerfile
* add docs for vulkan
* docs: sycl build in docker
* docs: remove trailing spaces
* docs: sycl: add docker section
* docs: clarify install vulkan SDK outside docker
* sycl: use intel/oneapi-basekit docker image
* docs: correct TOC
* docs: correct docker image for Intel oneMKL
.devops/main-intel.Dockerfile (modified)

@@ -1,8 +1,8 @@
 ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
-ARG UBUNTU_VERSION=22.04
 
-FROM intel/hpckit:$ONEAPI_VERSION as build
+FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
 
+ARG LLAMA_SYCL_F16=OFF
 RUN apt-get update && \
     apt-get install -y git
 
@@ -10,16 +10,18 @@ WORKDIR /app
 
 COPY . .
 
-# for some reasons, "-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DLLAMA_NATIVE=ON" give worse performance
 RUN mkdir build && \
     cd build && \
-    cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx && \
-    cmake --build . --config Release --target main server
+    if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
+        echo "LLAMA_SYCL_F16 is set" && \
+        export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
+    fi && \
+    cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
+    cmake --build . --config Release --target main
 
-FROM ubuntu:$UBUNTU_VERSION as runtime
+FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
 
 COPY --from=build /app/build/bin/main /main
-COPY --from=build /app/build/bin/server /server
 
 ENV LC_ALL=C.utf8
 
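The new `LLAMA_SYCL_F16` build argument is consumed inside the `RUN` step above, where it adds `-DLLAMA_SYCL_F16=ON` to the cmake call. A minimal sketch of passing it at image build time, using the same image tag and commands as the README section added later in this commit:

```sh
# F32 build (default: LLAMA_SYCL_F16 stays OFF)
docker build -t llama-cpp-sycl -f .devops/main-intel.Dockerfile .

# F16 build: the build arg makes the RUN step add -DLLAMA_SYCL_F16=ON
docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/main-intel.Dockerfile .
```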
							
								
								
									
.devops/main-vulkan.Dockerfile (new file, 29 lines)
@@ -0,0 +1,29 @@
+ARG UBUNTU_VERSION=jammy
+
+FROM ubuntu:$UBUNTU_VERSION as build
+
+# Install build tools
+RUN apt update && apt install -y git build-essential cmake wget
+
+# Install Vulkan SDK
+RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
+    wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
+    apt update -y && \
+    apt-get install -y vulkan-sdk
+
+# Build it
+WORKDIR /app
+COPY . .
+RUN mkdir build && \
+    cd build && \
+    cmake .. -DLLAMA_VULKAN=1 && \
+    cmake --build . --config Release --target main
+
+# Clean up
+WORKDIR /
+RUN cp /app/build/bin/main /main && \
+    rm -rf /app
+
+ENV LC_ALL=C.utf8
+
+ENTRYPOINT [ "/main" ]
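A minimal sketch of building and running this image, mirroring the Vulkan instructions added to README.md further down; the `/dev/dri/*` device paths are host-specific and the model path is a placeholder:

```sh
# Build the image from the repository root
docker build -t llama-cpp-vulkan -f .devops/main-vulkan.Dockerfile .

# Run it, passing the GPU through via /dev/dri (pick the card node that matches your GPU)
docker run -it --rm -v "$(pwd):/app:Z" \
    --device /dev/dri/renderD128:/dev/dri/renderD128 \
    --device /dev/dri/card1:/dev/dri/card1 \
    llama-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
```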
.devops/server-intel.Dockerfile (modified)

@@ -1,8 +1,8 @@
 ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
-ARG UBUNTU_VERSION=22.04
 
-FROM intel/hpckit:$ONEAPI_VERSION as build
+FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
 
+ARG LLAMA_SYCL_F16=OFF
 RUN apt-get update && \
     apt-get install -y git
 
@@ -10,13 +10,16 @@ WORKDIR /app
 
 COPY . .
 
-# for some reasons, "-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DLLAMA_NATIVE=ON" give worse performance
 RUN mkdir build && \
     cd build && \
-    cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx && \
-    cmake --build . --config Release --target main server
+    if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
+        echo "LLAMA_SYCL_F16 is set" && \
+        export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
+    fi && \
+    cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
+    cmake --build . --config Release --target server
 
-FROM ubuntu:$UBUNTU_VERSION as runtime
+FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
 
 COPY --from=build /app/build/bin/server /server
 
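This image packages the `server` example, so arguments after the image name go to the server binary. A hypothetical build-and-run sketch; the image tag, the published port, and the `--host`/`--port` flags are assumptions, not something this commit documents:

```sh
# Build the SYCL server image
docker build -t llama-cpp-sycl-server -f .devops/server-intel.Dockerfile .

# Run it, publishing the HTTP port and passing the GPU through
docker run -it --rm -p 8080:8080 -v "$(pwd)/models:/models" \
    --device /dev/dri/renderD128:/dev/dri/renderD128 \
    --device /dev/dri/card1:/dev/dri/card1 \
    llama-cpp-sycl-server -m "/models/YOUR_MODEL_FILE" --host 0.0.0.0 --port 8080 -ngl 33
```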
							
								
								
									
.devops/server-vulkan.Dockerfile (new file, 29 lines)
@@ -0,0 +1,29 @@
+ARG UBUNTU_VERSION=jammy
+
+FROM ubuntu:$UBUNTU_VERSION as build
+
+# Install build tools
+RUN apt update && apt install -y git build-essential cmake wget
+
+# Install Vulkan SDK
+RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
+    wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
+    apt update -y && \
+    apt-get install -y vulkan-sdk
+
+# Build it
+WORKDIR /app
+COPY . .
+RUN mkdir build && \
+    cd build && \
+    cmake .. -DLLAMA_VULKAN=1 && \
+    cmake --build . --config Release --target server
+
+# Clean up
+WORKDIR /
+RUN cp /app/build/bin/server /server && \
+    rm -rf /app
+
+ENV LC_ALL=C.utf8
+
+ENTRYPOINT [ "/server" ]
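As with the SYCL server image above, a hypothetical run sketch (image tag, port mapping, and server flags are assumptions):

```sh
docker build -t llama-cpp-vulkan-server -f .devops/server-vulkan.Dockerfile .
docker run -it --rm -p 8080:8080 -v "$(pwd)/models:/models" \
    --device /dev/dri/renderD128:/dev/dri/renderD128 \
    --device /dev/dri/card1:/dev/dri/card1 \
    llama-cpp-vulkan-server -m "/models/YOUR_MODEL_FILE" --host 0.0.0.0 --port 8080 -ngl 33
```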
							
								
								
									
README-sycl.md (102 lines changed)
@@ -1,22 +1,15 @@
 # llama.cpp for SYCL
 
-[Background](#background)
-
-[OS](#os)
-
-[Intel GPU](#intel-gpu)
-
-[Linux](#linux)
-
-[Windows](#windows)
-
-[Environment Variable](#environment-variable)
-
-[Known Issue](#known-issue)
-
-[Q&A](#q&a)
-
-[Todo](#todo)
+- [Background](#background)
+- [OS](#os)
+- [Intel GPU](#intel-gpu)
+- [Docker](#docker)
+- [Linux](#linux)
+- [Windows](#windows)
+- [Environment Variable](#environment-variable)
+- [Known Issue](#known-issue)
+- [Q&A](#q&a)
+- [Todo](#todo)
 
 ## Background
 
@@ -36,7 +29,7 @@ For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building).
 
 |OS|Status|Verified|
 |-|-|-|
-|Linux|Support|Ubuntu 22.04|
+|Linux|Support|Ubuntu 22.04, Fedora Silverblue 39|
 |Windows|Support|Windows 11|
 
 
@@ -50,7 +43,7 @@ For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building).
 |Intel Data Center Flex Series| Support| Flex 170|
 |Intel Arc Series| Support| Arc 770, 730M|
 |Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
-|Intel iGPU| Support| iGPU in i5-1250P, i7-1165G7|
+|Intel iGPU| Support| iGPU in i5-1250P, i7-1260P, i7-1165G7|
 
 Note: If the EUs (Execution Unit) in iGPU is less than 80, the inference speed will be too slow to use.
 
@@ -64,6 +57,38 @@ For iGPU, please make sure the shared memory from host memory is enough. For lla
 
 For dGPU, please make sure the device memory is enough. For llama-2-7b.Q4_0, recommend the device memory is 4GB+.
 
+## Docker
+
+Note:
+- Only docker on Linux is tested. Docker on WSL may not work.
+- You may need to install Intel GPU driver on the host machine (See the [Linux](#linux) section to know how to do that)
+
+### Build the image
+
+You can choose between **F16** and **F32** build. F16 is faster for long-prompt inference.
+
+
+```sh
+# For F16:
+#docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/main-intel.Dockerfile .
+
+# Or, for F32:
+docker build -t llama-cpp-sycl -f .devops/main-intel.Dockerfile .
+
+# Note: you can also use the ".devops/server-intel.Dockerfile", which compiles the "server" example
+```
+
+### Run
+
+```sh
+# Firstly, find all the DRI cards:
+ls -la /dev/dri
+# Then, pick the card that you want to use.
+
+# For example with "/dev/dri/card1"
+docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-sycl -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
+```
+
 ## Linux
 
 ### Setup Environment
@@ -76,7 +101,7 @@ Note: for iGPU, please install the client GPU driver.
 
 b. Add user to group: video, render.
 
-```
+```sh
 sudo usermod -aG render username
 sudo usermod -aG video username
 ```
@@ -85,7 +110,7 @@ Note: re-login to enable it.
 
 c. Check
 
-```
+```sh
 sudo apt install clinfo
 sudo clinfo -l
 ```
@@ -103,7 +128,6 @@ Platform #0: Intel(R) OpenCL HD Graphics
 
 2. Install Intel® oneAPI Base toolkit.
 
-
 a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
 
 Recommend to install to default folder: **/opt/intel/oneapi**.
@@ -112,7 +136,7 @@ Following guide use the default folder as example. If you use other folder, plea
 
 b. Check
 
-```
+```sh
 source /opt/intel/oneapi/setvars.sh
 
 sycl-ls
@@ -131,21 +155,25 @@ Output (example):
 
 2. Build locally:
 
-```
+Note:
+- You can choose between **F16** and **F32** build. F16 is faster for long-prompt inference.
+- By default, it will build for all binary files. It will take more time. To reduce the time, we recommend to build for **example/main** only.
+
+```sh
 mkdir -p build
 cd build
 source /opt/intel/oneapi/setvars.sh
 
-#for FP16
-#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON # faster for long-prompt inference
+# For FP16:
+#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
 
-#for FP32
+# Or, for FP32:
 cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
 
-#build example/main only
+# Build example/main only
 #cmake --build . --config Release --target main
 
-#build all binary
+# Or, build all binary
 cmake --build . --config Release -v
 
 cd ..
@@ -153,14 +181,10 @@ cd ..
 
 or
 
-```
+```sh
 ./examples/sycl/build.sh
 ```
 
-Note:
-
-- By default, it will build for all binary files. It will take more time. To reduce the time, we recommend to build for **example/main** only.
-
 ### Run
 
 1. Put model file to folder **models**
@@ -177,10 +201,10 @@ source /opt/intel/oneapi/setvars.sh
 
 Run without parameter:
 
-```
+```sh
 ./build/bin/ls-sycl-device
 
-or
+# or running the "main" executable and look at the output log:
 
 ./build/bin/main
 ```
@@ -209,13 +233,13 @@ found 4 SYCL devices:
 
 Set device ID = 0 by **GGML_SYCL_DEVICE=0**
 
-```
+```sh
 GGML_SYCL_DEVICE=0 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
 ```
 or run by script:
 
-```
-./examples/sycl/run-llama2.sh
+```sh
+./examples/sycl/run_llama2.sh
 ```
 
 Note:
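The Docker section added above selects the GPU with `--device`, while the native instructions select the SYCL device with the `GGML_SYCL_DEVICE` environment variable. A sketch combining the two; forwarding the variable with `-e` is an assumption, not something the README shows:

```sh
# Pin the container to SYCL device 0
docker run -it --rm -e GGML_SYCL_DEVICE=0 \
    --device /dev/dri/renderD128:/dev/dri/renderD128 \
    --device /dev/dri/card1:/dev/dri/card1 \
    -v "$(pwd):/app:Z" \
    llama-cpp-sycl -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
```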
							
								
								
									
README.md (64 lines changed)
@@ -393,28 +393,28 @@ Building the program with BLAS support may lead to some performance improvements
 
   Check [BLIS.md](docs/BLIS.md) for more information.
 
+- #### SYCL
+  SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators.
+
+  llama.cpp based on SYCL is used to **support Intel GPU** (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU).
+
+  For detailed info, please refer to [llama.cpp for SYCL](README-sycl.md).
+
 - #### Intel oneMKL
+  Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni. Please note that this build config **does not support Intel GPU**. For Intel GPU support, please refer to [llama.cpp for SYCL](./README-sycl.md).
 
   - Using manual oneAPI installation:
     By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DLLAMA_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. Otherwise please install oneAPI and follow the below steps:
       ```bash
       mkdir build
       cd build
-      source /opt/intel/oneapi/setvars.sh # You can skip this step if  in oneapi-runtime docker image, only required for manual installation
+      source /opt/intel/oneapi/setvars.sh # You can skip this step if  in oneapi-basekit docker image, only required for manual installation
       cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_NATIVE=ON
       cmake --build . --config Release
       ```
 
   - Using oneAPI docker image:
-    If you do not want to source the environment vars and install oneAPI manually, you can also build the code using intel docker container: [oneAPI-runtime](https://hub.docker.com/r/intel/oneapi-runtime)
-
-      ```bash
-      mkdir build
-      cd build
-      cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_NATIVE=ON
-      cmake --build . --config Release
-      ```
-
-  Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni.
+    If you do not want to source the environment vars and install oneAPI manually, you can also build the code using intel docker container: [oneAPI-basekit](https://hub.docker.com/r/intel/oneapi-basekit). Then, you can use the commands given above.
+
   Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information.
 
@@ -601,14 +601,48 @@ Building the program with BLAS support may lead to some performance improvements
 
   You can get a list of platforms and devices from the `clinfo -l` command, etc.
 
-- #### SYCL
+- #### Vulkan
 
-  SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators.
+  **With docker**:
 
-  llama.cpp based on SYCL is used to support Intel GPU (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU).
+  You don't need to install Vulkan SDK. It will be installed inside the container.
 
-  For detailed info, please refer to [llama.cpp for SYCL](README-sycl.md).
+  ```sh
+  # Build the image
+  docker build -t llama-cpp-vulkan -f .devops/main-vulkan.Dockerfile .
+
+  # Then, use it:
+  docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
+  ```
+
+  **Without docker**:
+
+  Firstly, you need to make sure you installed [Vulkan SDK](https://vulkan.lunarg.com/doc/view/latest/linux/getting_started_ubuntu.html)
+
+  For example, on Ubuntu 22.04 (jammy), use the command below:
+
+  ```bash
+  wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add -
+  wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
+  apt update -y
+  apt-get install -y vulkan-sdk
+  # To verify the installation, use the command below:
+  vulkaninfo
+  ```
+
+  Then, build llama.cpp using the cmake command below:
+
+  ```bash
+  mkdir -p build
+  cd build
+  cmake .. -DLLAMA_VULKAN=1
+  cmake --build . --config Release
+  # Test the output binary (with "-ngl 33" to offload all layers to GPU)
+  ./bin/main -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4
+
+  # You should see in the output, ggml_vulkan detected your GPU. For example:
+  # ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32
+  ```
 
 ### Prepare Data & Run
 
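The Intel oneMKL subsection above notes that the same cmake commands can be run inside the intel/oneapi-basekit container instead of sourcing oneAPI locally. A hypothetical one-shot invocation; the image tag (taken from the Dockerfiles in this commit) and the bind-mount layout are assumptions:

```sh
docker run -it --rm -v "$(pwd):/app" -w /app intel/oneapi-basekit:2024.0.1-devel-ubuntu22.04 bash -c '
    mkdir -p build && cd build &&
    cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_NATIVE=ON &&
    cmake --build . --config Release
'
```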
Author: Xuan Son Nguyen