mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-31 08:51:55 +00:00 
			
		
		
		
	Introducing experimental OpenCL backend with support for Qualcomm Adreno GPUs (#10693)
* [cl][adreno] Add Adreno GPU support Add new OpenCL backend to support Adreno GPUs --------- Co-authored-by: Skyler Szot <quic_sszot@quicinc.com> Co-authored-by: Shangqing Gu <quic_shawngu@quicinc.com> Co-authored-by: Alexander Angus <quic_aangus@quicinc.com> Co-authored-by: Hongqiang Wang <quic_wangh@quicinc.com> Co-authored-by: Max Krasnyansky <quic_maxk@quicinc.com> * [cl][ci] Add workflow for CL * [cl][adreno] Fix memory leak for non SMALL_ALLOC path * opencl: integrate backend dyn.load interface and fix compiler and format warnings * opencl: remove small-alloc support and fix build errors for non-opencl platforms * opencl: fixed merge conflict (MUSA added twice in cmake) * opencl-ci: use RUNNER_TEMP instead of github.workspace * opencl: fix embed tool invocation with python3 * opencl: CI workflow fixes * opencl: Clean up small-alloc in CMake files * opencl: cleanup ggml-opencl2 header file * opencl: use ulong for offsets and strides in ADD kernel * opencl: use cl_ulong for all offsets * opencl: use cl_ulong for sizes and strides * opencl: use `GGML_LOG_xxx` instead of `fprintf(stderr, ...)` * opencl: rename backend `opencl2` -> `opencl` * opencl: rename kernel files `ggml-opencl2` -> `ggml-opencl` * opencl: make OpenCL required, remove redundant lib and inc directories * `ggml-base`, `..` and `.` are added by `ggml_add_backend_library` * opencl: rename backend - funcs, structs, etc `opencl2` -> `opencl` * opencl: remove copyright marker since main license already covers * opencl: replace some more OPENCL2 leftovers * opencl: remove limits on `tensor_extra` * opencl: use pools for `tensor_extra` * opencl: fix compiler warnings with GCC and Clang Still getting the warning about clCreateCmdQueue being obsolete. Will fix that separately. * opencl: fail gracefully if opencl devices are not available Also for unsupported GPUs. * opencl: fix MSVC builds (string length error) * opencl: check for various requirements, allow deprecated API * opencl: update log message for unsupported GPUs --------- Co-authored-by: Skyler Szot <quic_sszot@quicinc.com> Co-authored-by: Shangqing Gu <quic_shawngu@quicinc.com> Co-authored-by: Alexander Angus <quic_aangus@quicinc.com> Co-authored-by: Hongqiang Wang <quic_wangh@quicinc.com> Co-authored-by: Max Krasnyansky <quic_maxk@quicinc.com>
This commit is contained in:
		
							
								
								
									
										26
									
								
								.github/workflows/build.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										26
									
								
								.github/workflows/build.yml
									
									
									
									
										vendored
									
									
								
							| @@ -662,6 +662,8 @@ jobs: | |||||||
|             defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON' |             defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON' | ||||||
|           - build: 'msvc-arm64' |           - build: 'msvc-arm64' | ||||||
|             defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON' |             defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON' | ||||||
|  |           - build: 'llvm-arm64-opencl-adreno' | ||||||
|  |             defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON' | ||||||
|  |  | ||||||
|     steps: |     steps: | ||||||
|       - name: Clone |       - name: Clone | ||||||
| @@ -703,6 +705,28 @@ jobs: | |||||||
|         run: | |         run: | | ||||||
|           choco install ninja |           choco install ninja | ||||||
|  |  | ||||||
|  |       - name: Install OpenCL Headers and Libs | ||||||
|  |         id: install_opencl | ||||||
|  |         if: ${{ matrix.build == 'llvm-arm64-opencl-adreno' }} | ||||||
|  |         run: | | ||||||
|  |           git clone https://github.com/KhronosGroup/OpenCL-Headers | ||||||
|  |           cd OpenCL-Headers | ||||||
|  |           mkdir build && cd build | ||||||
|  |           cmake .. ` | ||||||
|  |             -DBUILD_TESTING=OFF ` | ||||||
|  |             -DOPENCL_HEADERS_BUILD_TESTING=OFF ` | ||||||
|  |             -DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF ` | ||||||
|  |             -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release" | ||||||
|  |           cmake --build . --target install | ||||||
|  |           git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader | ||||||
|  |           cd OpenCL-ICD-Loader | ||||||
|  |           mkdir build-arm64-release && cd build-arm64-release | ||||||
|  |           cmake .. ` | ||||||
|  |             -A arm64 ` | ||||||
|  |             -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" ` | ||||||
|  |             -DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release" | ||||||
|  |           cmake --build . --target install --config release | ||||||
|  |  | ||||||
|       - name: Build |       - name: Build | ||||||
|         id: cmake_build |         id: cmake_build | ||||||
|         run: | |         run: | | ||||||
| @@ -732,7 +756,7 @@ jobs: | |||||||
|       - name: Test |       - name: Test | ||||||
|         id: cmake_test |         id: cmake_test | ||||||
|         # not all machines have native AVX-512 |         # not all machines have native AVX-512 | ||||||
|         if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }} |         if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'llvm-arm64-opencl-adreno' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }} | ||||||
|         run: | |         run: | | ||||||
|           cd build |           cd build | ||||||
|           ctest -L main -C Release --verbose --timeout 900 |           ctest -L main -C Release --verbose --timeout 900 | ||||||
|   | |||||||
| @@ -179,6 +179,11 @@ set   (GGML_SYCL_TARGET "INTEL" CACHE STRING | |||||||
| set   (GGML_SYCL_DEVICE_ARCH "" CACHE STRING | set   (GGML_SYCL_DEVICE_ARCH "" CACHE STRING | ||||||
|                                             "ggml: sycl device architecture") |                                             "ggml: sycl device architecture") | ||||||
|  |  | ||||||
|  | option(GGML_OPENCL                          "ggml: use OpenCL"                                OFF) | ||||||
|  | option(GGML_OPENCL_PROFILING                "ggml: use OpenCL profiling (increases overhead)" OFF) | ||||||
|  | option(GGML_OPENCL_EMBED_KERNELS            "ggml: embed kernels"                             ON) | ||||||
|  | option(GGML_OPENCL_USE_ADRENO_KERNELS       "ggml: use optimized kernels for Adreno"          ON) | ||||||
|  |  | ||||||
| # extra artifacts | # extra artifacts | ||||||
| option(GGML_BUILD_TESTS    "ggml: build tests"    ${GGML_STANDALONE}) | option(GGML_BUILD_TESTS    "ggml: build tests"    ${GGML_STANDALONE}) | ||||||
| option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE}) | option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE}) | ||||||
|   | |||||||
							
								
								
									
										26
									
								
								ggml/include/ggml-opencl.h
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										26
									
								
								ggml/include/ggml-opencl.h
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,26 @@ | |||||||
|  | #ifndef GGML_OPENCL_H | ||||||
|  | #define GGML_OPENCL_H | ||||||
|  |  | ||||||
|  | #include "ggml.h" | ||||||
|  | #include "ggml-backend.h" | ||||||
|  |  | ||||||
|  | #ifdef  __cplusplus | ||||||
|  | extern "C" { | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  | // | ||||||
|  | // backend API | ||||||
|  | // | ||||||
|  | GGML_BACKEND_API ggml_backend_t ggml_backend_opencl_init(void); | ||||||
|  | GGML_BACKEND_API bool ggml_backend_is_opencl(ggml_backend_t backend); | ||||||
|  |  | ||||||
|  | GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type(void); | ||||||
|  | GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type(void); | ||||||
|  |  | ||||||
|  | GGML_BACKEND_API ggml_backend_reg_t ggml_backend_opencl_reg(void); | ||||||
|  |  | ||||||
|  | #ifdef  __cplusplus | ||||||
|  | } | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  | #endif // GGML_OPENCL_H | ||||||
| @@ -308,6 +308,7 @@ ggml_add_backend(MUSA) | |||||||
| ggml_add_backend(RPC) | ggml_add_backend(RPC) | ||||||
| ggml_add_backend(SYCL) | ggml_add_backend(SYCL) | ||||||
| ggml_add_backend(Vulkan) | ggml_add_backend(Vulkan) | ||||||
|  | ggml_add_backend(OpenCL) | ||||||
|  |  | ||||||
| foreach (target ggml-base ggml) | foreach (target ggml-base ggml) | ||||||
|     target_include_directories(${target} PUBLIC    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>) |     target_include_directories(${target} PUBLIC    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include> $<INSTALL_INTERFACE:include>) | ||||||
|   | |||||||
| @@ -46,6 +46,10 @@ | |||||||
| #include "ggml-vulkan.h" | #include "ggml-vulkan.h" | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|  | #ifdef GGML_USE_OPENCL | ||||||
|  | #include "ggml-opencl.h" | ||||||
|  | #endif | ||||||
|  |  | ||||||
| #ifdef GGML_USE_BLAS | #ifdef GGML_USE_BLAS | ||||||
| #include "ggml-blas.h" | #include "ggml-blas.h" | ||||||
| #endif | #endif | ||||||
| @@ -146,6 +150,9 @@ struct ggml_backend_registry { | |||||||
| #ifdef GGML_USE_VULKAN | #ifdef GGML_USE_VULKAN | ||||||
|         register_backend(ggml_backend_vk_reg()); |         register_backend(ggml_backend_vk_reg()); | ||||||
| #endif | #endif | ||||||
|  | #ifdef GGML_USE_OPENCL | ||||||
|  |         register_backend(ggml_backend_opencl_reg()); | ||||||
|  | #endif | ||||||
| #ifdef GGML_USE_CANN | #ifdef GGML_USE_CANN | ||||||
|         register_backend(ggml_backend_cann_reg()); |         register_backend(ggml_backend_cann_reg()); | ||||||
| #endif | #endif | ||||||
| @@ -539,6 +546,7 @@ void ggml_backend_load_all_from_path(const char * dir_path) { | |||||||
|     ggml_backend_load_best("rpc", silent, dir_path); |     ggml_backend_load_best("rpc", silent, dir_path); | ||||||
|     ggml_backend_load_best("sycl", silent, dir_path); |     ggml_backend_load_best("sycl", silent, dir_path); | ||||||
|     ggml_backend_load_best("vulkan", silent, dir_path); |     ggml_backend_load_best("vulkan", silent, dir_path); | ||||||
|  |     ggml_backend_load_best("opencl", silent, dir_path); | ||||||
|     ggml_backend_load_best("musa", silent, dir_path); |     ggml_backend_load_best("musa", silent, dir_path); | ||||||
|     ggml_backend_load_best("cpu", silent, dir_path); |     ggml_backend_load_best("cpu", silent, dir_path); | ||||||
| } | } | ||||||
|   | |||||||
							
								
								
									
										147
									
								
								ggml/src/ggml-opencl/CMakeLists.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										147
									
								
								ggml/src/ggml-opencl/CMakeLists.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,147 @@ | |||||||
|  | find_package(OpenCL REQUIRED) | ||||||
|  | find_package(Python3 REQUIRED) | ||||||
|  |  | ||||||
|  | set(TARGET_NAME ggml-opencl) | ||||||
|  |  | ||||||
|  | ggml_add_backend_library(${TARGET_NAME} | ||||||
|  |                          ggml-opencl.cpp | ||||||
|  |                          ../../include/ggml-opencl.h) | ||||||
|  | target_link_libraries(${TARGET_NAME} PRIVATE ${OpenCL_LIBRARIES}) | ||||||
|  | target_include_directories(${TARGET_NAME} PRIVATE ${OpenCL_INCLUDE_DIRS}) | ||||||
|  |  | ||||||
|  | if (GGML_OPENCL_PROFILING) | ||||||
|  |     message(STATUS "OpenCL profiling enabled (increases CPU overhead)") | ||||||
|  |     add_compile_definitions(GGML_OPENCL_PROFILING) | ||||||
|  | endif () | ||||||
|  |  | ||||||
|  | add_compile_definitions(GGML_OPENCL_SOA_Q) | ||||||
|  |  | ||||||
|  | if (GGML_OPENCL_USE_ADRENO_KERNELS) | ||||||
|  |     message(STATUS "OpenCL will use matmul kernels optimized for Adreno") | ||||||
|  |     add_compile_definitions(GGML_OPENCL_USE_ADRENO_KERNELS) | ||||||
|  | endif () | ||||||
|  |  | ||||||
|  | if (GGML_OPENCL_EMBED_KERNELS) | ||||||
|  |     add_compile_definitions(GGML_OPENCL_EMBED_KERNELS) | ||||||
|  |  | ||||||
|  |     set(OPENCL_CL_SOURCE_EMBED         "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl.cl.h") | ||||||
|  |     set(OPENCL_MM_CL_SOURCE_EMBED      "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_mm.cl.h") | ||||||
|  |     set(OPENCL_CVT_CL_SOURCE_EMBED     "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_cvt.cl.h") | ||||||
|  |  | ||||||
|  |     set(OPENCL_GEMV_NOSHUFFLE_SOURCE_EMBED             "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_gemv_noshuffle.cl.h") | ||||||
|  |     set(OPENCL_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED     "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_gemv_noshuffle_general.cl.h") | ||||||
|  |     set(OPENCL_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED          "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_mul_mat_Ab_Bi_8x4.cl.h") | ||||||
|  |     set(OPENCL_TRANSPOSE_16_SOURCE_EMBED               "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_transpose_16.cl.h") | ||||||
|  |     set(OPENCL_TRANSPOSE_32_SOURCE_EMBED               "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_transpose_32.cl.h") | ||||||
|  |     set(OPENCL_TRANSPOSE_32_16_SOURCE_EMBED            "${CMAKE_BINARY_DIR}/autogenerated/ggml-opencl_transpose_32_16.cl.h") | ||||||
|  |  | ||||||
|  |     set(EMBED_KERNEL_SCRIPT             "${CMAKE_CURRENT_SOURCE_DIR}/kernels/embed_kernel.py") | ||||||
|  |     file(MAKE_DIRECTORY                 "${CMAKE_BINARY_DIR}/autogenerated") | ||||||
|  |  | ||||||
|  |     include_directories("${CMAKE_BINARY_DIR}/autogenerated") | ||||||
|  |  | ||||||
|  |     # Python must be accessible from command line | ||||||
|  |     add_custom_command( | ||||||
|  |         OUTPUT ${OPENCL_CL_SOURCE_EMBED} | ||||||
|  |         COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT} | ||||||
|  |             ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl.cl | ||||||
|  |             ${OPENCL_CL_SOURCE_EMBED} | ||||||
|  |         DEPENDS kernels/ggml-opencl.cl ${EMBED_KERNEL_SCRIPT} | ||||||
|  |         COMMENT "Generate ggml-opencl.cl.h" | ||||||
|  |     ) | ||||||
|  |  | ||||||
|  |     add_custom_command( | ||||||
|  |         OUTPUT ${OPENCL_MM_CL_SOURCE_EMBED} | ||||||
|  |         COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT} | ||||||
|  |             ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_mm.cl | ||||||
|  |             ${OPENCL_MM_CL_SOURCE_EMBED} | ||||||
|  |         DEPENDS kernels/ggml-opencl_mm.cl ${EMBED_KERNEL_SCRIPT} | ||||||
|  |         COMMENT "Generate ggml-opencl_mm.cl.h" | ||||||
|  |     ) | ||||||
|  |  | ||||||
|  |     add_custom_command( | ||||||
|  |         OUTPUT ${OPENCL_CVT_CL_SOURCE_EMBED} | ||||||
|  |         COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT} | ||||||
|  |             ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_cvt.cl | ||||||
|  |             ${OPENCL_CVT_CL_SOURCE_EMBED} | ||||||
|  |         DEPENDS kernels/ggml-opencl_cvt.cl ${EMBED_KERNEL_SCRIPT} | ||||||
|  |         COMMENT "Generate ggml-opencl_cvt.cl.h" | ||||||
|  |     ) | ||||||
|  |  | ||||||
|  |     add_custom_command( | ||||||
|  |         OUTPUT ${OPENCL_GEMV_NOSHUFFLE_SOURCE_EMBED} | ||||||
|  |         COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT} | ||||||
|  |             ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_gemv_noshuffle.cl | ||||||
|  |             ${OPENCL_GEMV_NOSHUFFLE_SOURCE_EMBED} | ||||||
|  |         DEPENDS kernels/ggml-opencl_gemv_noshuffle.cl ${EMBED_KERNEL_SCRIPT} | ||||||
|  |         COMMENT "Generate ggml-opencl_gemv_noshuffle.cl.h" | ||||||
|  |     ) | ||||||
|  |  | ||||||
|  |     add_custom_command( | ||||||
|  |         OUTPUT ${OPENCL_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED} | ||||||
|  |         COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT} | ||||||
|  |             ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_gemv_noshuffle_general.cl | ||||||
|  |             ${OPENCL_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED} | ||||||
|  |         DEPENDS kernels/ggml-opencl_gemv_noshuffle_general.cl ${EMBED_KERNEL_SCRIPT} | ||||||
|  |         COMMENT "Generate ggml-opencl_gemv_noshuffle_general.cl.h" | ||||||
|  |     ) | ||||||
|  |  | ||||||
|  |     add_custom_command( | ||||||
|  |         OUTPUT ${OPENCL_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED} | ||||||
|  |         COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT} | ||||||
|  |             ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl | ||||||
|  |             ${OPENCL_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED} | ||||||
|  |         DEPENDS kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl ${EMBED_KERNEL_SCRIPT} | ||||||
|  |         COMMENT "Generate ggml-opencl_mul_mat_Ab_Bi_8x4.cl.cl.h" | ||||||
|  |     ) | ||||||
|  |  | ||||||
|  |     add_custom_command( | ||||||
|  |         OUTPUT ${OPENCL_TRANSPOSE_16_SOURCE_EMBED} | ||||||
|  |         COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT} | ||||||
|  |             ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_transpose_16.cl | ||||||
|  |             ${OPENCL_TRANSPOSE_16_SOURCE_EMBED} | ||||||
|  |         DEPENDS kernels/ggml-opencl_transpose_16.cl ${EMBED_KERNEL_SCRIPT} | ||||||
|  |         COMMENT "Generate ggml-opencl_transpose_16.cl.h" | ||||||
|  |     ) | ||||||
|  |  | ||||||
|  |     add_custom_command( | ||||||
|  |         OUTPUT ${OPENCL_TRANSPOSE_32_SOURCE_EMBED} | ||||||
|  |         COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT} | ||||||
|  |             ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_transpose_32.cl | ||||||
|  |             ${OPENCL_TRANSPOSE_32_SOURCE_EMBED} | ||||||
|  |         DEPENDS kernels/ggml-opencl_transpose_32.cl ${EMBED_KERNEL_SCRIPT} | ||||||
|  |         COMMENT "Generate ggml-opencl_transpose_32.cl.h" | ||||||
|  |     ) | ||||||
|  |  | ||||||
|  |     add_custom_command( | ||||||
|  |         OUTPUT ${OPENCL_TRANSPOSE_32_16_SOURCE_EMBED} | ||||||
|  |         COMMAND ${Python3_EXECUTABLE} ${EMBED_KERNEL_SCRIPT} | ||||||
|  |             ${CMAKE_CURRENT_SOURCE_DIR}/kernels/ggml-opencl_transpose_32_16.cl | ||||||
|  |             ${OPENCL_TRANSPOSE_32_16_SOURCE_EMBED} | ||||||
|  |         DEPENDS kernels/ggml-opencl_transpose_32_16.cl ${EMBED_KERNEL_SCRIPT} | ||||||
|  |         COMMENT "Generate ggml-opencl_transpose_32_16.cl.h" | ||||||
|  |     ) | ||||||
|  |  | ||||||
|  |     target_sources(${TARGET_NAME} PRIVATE | ||||||
|  |                    ${OPENCL_CL_SOURCE_EMBED} | ||||||
|  |                    ${OPENCL_MM_CL_SOURCE_EMBED} | ||||||
|  |                    ${OPENCL_CVT_CL_SOURCE_EMBED} | ||||||
|  |                    ${OPENCL_GEMV_NOSHUFFLE_SOURCE_EMBED} | ||||||
|  |                    ${OPENCL_GEMV_NOSHUFFLE_GENERAL_SOURCE_EMBED} | ||||||
|  |                    ${OPENCL_MUL_MAT_Ab_Bi_8x4_SOURCE_EMBED} | ||||||
|  |                    ${OPENCL_TRANSPOSE_16_SOURCE_EMBED} | ||||||
|  |                    ${OPENCL_TRANSPOSE_32_SOURCE_EMBED} | ||||||
|  |                    ${OPENCL_TRANSPOSE_32_16_SOURCE_EMBED}) | ||||||
|  | else () | ||||||
|  |     # copy ggml-opencl.cl to bin directory | ||||||
|  |     configure_file(kernels/ggml-opencl.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl.cl COPYONLY) | ||||||
|  |     configure_file(kernels/ggml-opencl_mm.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_mm.cl COPYONLY) | ||||||
|  |     configure_file(kernels/ggml-opencl_cvt.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_cvt.cl COPYONLY) | ||||||
|  |  | ||||||
|  |     configure_file(kernels/ggml-opencl_gemv_noshuffle.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_gemv_noshuffle.cl COPYONLY) | ||||||
|  |     configure_file(kernels/ggml-opencl_gemv_noshuffle_general.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_gemv_noshuffle_general.cl COPYONLY) | ||||||
|  |     configure_file(kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_mul_mat_Ab_Bi_8x4.cl COPYONLY) | ||||||
|  |     configure_file(kernels/ggml-opencl_transpose_16.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_transpose_16.cl COPYONLY) | ||||||
|  |     configure_file(kernels/ggml-opencl_transpose_32.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_transpose_32.cl COPYONLY) | ||||||
|  |     configure_file(kernels/ggml-opencl_transpose_32_16.cl ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-opencl_transpose_32_16.cl COPYONLY) | ||||||
|  | endif () | ||||||
							
								
								
									
										4004
									
								
								ggml/src/ggml-opencl/ggml-opencl.cpp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										4004
									
								
								ggml/src/ggml-opencl/ggml-opencl.cpp
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										26
									
								
								ggml/src/ggml-opencl/kernels/embed_kernel.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										26
									
								
								ggml/src/ggml-opencl/kernels/embed_kernel.py
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,26 @@ | |||||||
|  | # | ||||||
|  |  | ||||||
|  | import sys | ||||||
|  | import logging | ||||||
|  | logger = logging.getLogger("opencl-embed-kernel") | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def main(): | ||||||
|  |     logging.basicConfig(level=logging.INFO) | ||||||
|  |  | ||||||
|  |     if len(sys.argv) != 3: | ||||||
|  |         logger.info("Usage: python embed_kernel.py <input_file> <output_file>") | ||||||
|  |         sys.exit(1) | ||||||
|  |  | ||||||
|  |     ifile = open(sys.argv[1], "r") | ||||||
|  |     ofile = open(sys.argv[2], "w") | ||||||
|  |  | ||||||
|  |     for i in ifile: | ||||||
|  |         ofile.write('R"({})"\n'.format(i)) | ||||||
|  |  | ||||||
|  |     ifile.close() | ||||||
|  |     ofile.close() | ||||||
|  |  | ||||||
|  |  | ||||||
|  | if __name__ == "__main__": | ||||||
|  |     main() | ||||||
							
								
								
									
										2683
									
								
								ggml/src/ggml-opencl/kernels/ggml-opencl.cl
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										2683
									
								
								ggml/src/ggml-opencl/kernels/ggml-opencl.cl
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										106
									
								
								ggml/src/ggml-opencl/kernels/ggml-opencl_cvt.cl
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										106
									
								
								ggml/src/ggml-opencl/kernels/ggml-opencl_cvt.cl
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,106 @@ | |||||||
|  | //------------------------------------------------------------------------------ | ||||||
|  | // This file is contains additional kernels for data conversion. | ||||||
|  | // These kernels are used when loading the model, so its performance is less | ||||||
|  | // important. | ||||||
|  | //------------------------------------------------------------------------------ | ||||||
|  | #ifdef cl_khr_fp16 | ||||||
|  | #pragma OPENCL EXTENSION cl_khr_fp16 : enable | ||||||
|  | #elif defined(cl_amd_fp16) | ||||||
|  | #pragma OPENCL EXTENSION cl_amd_fp16 : enable | ||||||
|  | #else | ||||||
|  | #error "Half precision floating point not supportedby OpenCL implementation on your device." | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  | #ifdef cl_khr_subgroups | ||||||
|  | #pragma OPENCL EXTENSION cl_khr_subgroups : enable | ||||||
|  | #elif defined(cl_intel_subgroups) | ||||||
|  | #pragma OPENCL EXTENSION cl_intel_subgroups : enable | ||||||
|  | #else | ||||||
|  | #error "Subgroup not supported on your device." | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  | #ifdef cl_intel_required_subgroup_size | ||||||
|  | // Always use subgroup size of 32 on Intel. | ||||||
|  | #pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable | ||||||
|  | #define INTEL_GPU 1 | ||||||
|  | #define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16))) | ||||||
|  | #define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32))) | ||||||
|  | #elif defined(cl_qcom_reqd_sub_group_size) | ||||||
|  | // Always use subgroups size of 64 on Adreno. | ||||||
|  | #pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable | ||||||
|  | #define ADRENO_GPU 1 | ||||||
|  | #define REQD_SUBGROUP_SIZE_64  __attribute__((qcom_reqd_sub_group_size("half"))) | ||||||
|  | #define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full"))) | ||||||
|  | #else | ||||||
|  | // TODO: do not know how to choose subgroup size on other GPUs. | ||||||
|  | #error "Selecting subgroup size is not supported on your device." | ||||||
|  | #endif | ||||||
|  |  | ||||||
|  | #define QK4_0                   32 | ||||||
|  | #define QR4_0                   2 | ||||||
|  | #define QK4_1                   32 | ||||||
|  | #define QR4_1                   2 | ||||||
|  | #define QK5_0                   32 | ||||||
|  | #define QR5_0                   2 | ||||||
|  | #define QK5_1                   32 | ||||||
|  | #define QR5_1                   2 | ||||||
|  | #define QK8_0                   32 | ||||||
|  | #define QR8_0                   1 | ||||||
|  | #define QK_K                    256 | ||||||
|  | #define K_QUANTS_PER_ITERATION  2 | ||||||
|  |  | ||||||
|  | typedef char int8_t; | ||||||
|  | typedef uchar uint8_t; | ||||||
|  | typedef short int16_t; | ||||||
|  | typedef ushort uint16_t; | ||||||
|  | typedef int int32_t; | ||||||
|  | typedef uint uint32_t; | ||||||
|  |  | ||||||
|  | //------------------------------------------------------------------------------ | ||||||
|  | // block_q4_0 | ||||||
|  | //------------------------------------------------------------------------------ | ||||||
|  | struct block_q4_0 | ||||||
|  | { | ||||||
|  |     half d; | ||||||
|  |     uint8_t qs[QK4_0 / 2]; | ||||||
|  | }; | ||||||
|  |  | ||||||
|  | //------------------------------------------------------------------------------ | ||||||
|  | // mul_vec_q_n_f32_flat_noshuffle | ||||||
|  | // | ||||||
|  | // This variation uses flat arrays (struct of arrays, SOA) representation for | ||||||
|  | // quant tensors. It also uses non shuffled bit order for weights. | ||||||
|  | // | ||||||
|  | // The shuffled version is kept in the original file because moving it here | ||||||
|  | // seems to result in worse performance for adreno. | ||||||
|  | //------------------------------------------------------------------------------ | ||||||
|  |  | ||||||
|  | kernel void kernel_convert_block_q4_0_noshuffle( | ||||||
|  |     global struct block_q4_0 * src0, | ||||||
|  |     global uchar * dst_q, | ||||||
|  |     global half  * dst_d | ||||||
|  | ) { | ||||||
|  |     global struct block_q4_0 * b = (global struct block_q4_0 *) src0 + get_global_id(0); | ||||||
|  |     global uchar * q = (global uchar *) dst_q + QK4_0/2*get_global_id(0); | ||||||
|  |     global half  * d = (global half *) dst_d + get_global_id(0); | ||||||
|  |  | ||||||
|  |     *d = b->d; | ||||||
|  |     for (int i = 0; i < QK4_0/4; ++i) { | ||||||
|  |         uchar x0 = b->qs[2*i + 0]; | ||||||
|  |         uchar x1 = b->qs[2*i + 1]; | ||||||
|  |  | ||||||
|  |         q[i + 0      ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4); | ||||||
|  |         q[i + QK4_0/4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0); | ||||||
|  |  | ||||||
|  | #ifdef ADRENO_GPU | ||||||
|  |         // Workaround for adreno - must have the following printf statement for | ||||||
|  |         // the kernel to work properly. Otherwise it produces incorrect result. | ||||||
|  |         // convert_uchar above also seems necessary. | ||||||
|  |         // Compare against a large number so that it does not print anything. | ||||||
|  |         // get_sub_group_local_id() also works. | ||||||
|  |         if (get_global_id(0) == 65536*4096) { | ||||||
|  |             printf("%04x - %02x\n", *(global ushort*)d, ((x0 & 0xF0) >> 4) | (x1 & 0xF0)); | ||||||
|  |         } | ||||||
|  | #endif | ||||||
|  |     } | ||||||
|  | } | ||||||
							
								
								
									
										265
									
								
								ggml/src/ggml-opencl/kernels/ggml-opencl_gemv_noshuffle.cl
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										265
									
								
								ggml/src/ggml-opencl/kernels/ggml-opencl_gemv_noshuffle.cl
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,265 @@ | |||||||
|  | #pragma OPENCL EXTENSION cl_khr_fp16 : enable | ||||||
|  | #pragma OPENCL EXTENSION cl_khr_subgroups : enable | ||||||
|  | #pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load: enable | ||||||
|  | #pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load: enable | ||||||
|  | #pragma OPENCL EXTENSION cl_qcom_extra_vector_types : enable | ||||||
|  | #pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable | ||||||
|  |  | ||||||
|  | // assume | ||||||
|  | #define QK4_0 32 | ||||||
|  | #define N_SIMDGROUP 4 | ||||||
|  |  | ||||||
|  | #define dequantizeBlockAccum_ns_sgbroadcast_1_hi(total_sums, bits4, scale, y) \ | ||||||
|  |     float shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s0, 0); \ | ||||||
|  |     total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s1, 0); \ | ||||||
|  |     total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s2, 0); \ | ||||||
|  |     total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s3, 0); \ | ||||||
|  |     total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s4, 0); \ | ||||||
|  |     total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s5, 0); \ | ||||||
|  |     total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s6, 0); \ | ||||||
|  |     total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s7, 0); \ | ||||||
|  |     total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s0, 1); \ | ||||||
|  |     total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s1, 1); \ | ||||||
|  |     total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s2, 1); \ | ||||||
|  |     total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s3, 1); \ | ||||||
|  |     total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s4, 1); \ | ||||||
|  |     total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s5, 1); \ | ||||||
|  |     total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s6, 1); \ | ||||||
|  |     total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s7, 1); \ | ||||||
|  |     total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \ | ||||||
|  |  | ||||||
|  |  | ||||||
|  | #define dequantizeBlockAccum_ns_sgbroadcast_1_lo(total_sums, bits4, scale, y) \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s0, 2); \ | ||||||
|  |     total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s1, 2); \ | ||||||
|  |     total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s2, 2); \ | ||||||
|  |     total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s3, 2); \ | ||||||
|  |     total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s4, 2); \ | ||||||
|  |     total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s5, 2); \ | ||||||
|  |     total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s6, 2); \ | ||||||
|  |     total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s7, 2); \ | ||||||
|  |     total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s0, 3); \ | ||||||
|  |     total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s1, 3); \ | ||||||
|  |     total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s2, 3); \ | ||||||
|  |     total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s3, 3); \ | ||||||
|  |     total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s4, 3); \ | ||||||
|  |     total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s5, 3); \ | ||||||
|  |     total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s6, 3); \ | ||||||
|  |     total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s7, 3); \ | ||||||
|  |     total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \ | ||||||
|  |  | ||||||
|  |  | ||||||
|  | #define dequantizeBlockAccum_ns_sgbroadcast_8_hi(total_sums, bits4, scale, y) \ | ||||||
|  |     float8 shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y, 0); \ | ||||||
|  |     total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \ | ||||||
|  |     total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \ | ||||||
|  |     total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \ | ||||||
|  |     total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \ | ||||||
|  |     total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \ | ||||||
|  |     total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \ | ||||||
|  |     total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \ | ||||||
|  |     total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \ | ||||||
|  |     total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \ | ||||||
|  |     total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \ | ||||||
|  |     total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \ | ||||||
|  |     total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \ | ||||||
|  |     total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \ | ||||||
|  |     total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \ | ||||||
|  |     total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \ | ||||||
|  |     total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \ | ||||||
|  |     shared_y = sub_group_broadcast(y, 1); \ | ||||||
|  |     total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \ | ||||||
|  |     total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \ | ||||||
|  |     total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \ | ||||||
|  |     total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \ | ||||||
|  |     total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \ | ||||||
|  |     total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \ | ||||||
|  |     total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \ | ||||||
|  |     total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \ | ||||||
|  |     total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \ | ||||||
|  |     total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \ | ||||||
|  |     total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \ | ||||||
|  |     total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \ | ||||||
|  |     total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \ | ||||||
|  |     total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \ | ||||||
|  |     total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \ | ||||||
|  |     total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \ | ||||||
|  |  | ||||||
|  |  | ||||||
|  | #define dequantizeBlockAccum_ns_sgbroadcast_8_lo(total_sums, bits4, scale, y) \ | ||||||
|  |     shared_y = sub_group_broadcast(y, 2); \ | ||||||
|  |     total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \ | ||||||
|  |     total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \ | ||||||
|  |     total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \ | ||||||
|  |     total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \ | ||||||
|  |     total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \ | ||||||
|  |     total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \ | ||||||
|  |     total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \ | ||||||
|  |     total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \ | ||||||
|  |     total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \ | ||||||
|  |     total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \ | ||||||
|  |     total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \ | ||||||
|  |     total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \ | ||||||
|  |     total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \ | ||||||
|  |     total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \ | ||||||
|  |     total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \ | ||||||
|  |     total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \ | ||||||
|  |     shared_y = sub_group_broadcast(y, 3); \ | ||||||
|  |     total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \ | ||||||
|  |     total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \ | ||||||
|  |     total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \ | ||||||
|  |     total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \ | ||||||
|  |     total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \ | ||||||
|  |     total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \ | ||||||
|  |     total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \ | ||||||
|  |     total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \ | ||||||
|  |     total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \ | ||||||
|  |     total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \ | ||||||
|  |     total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \ | ||||||
|  |     total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \ | ||||||
|  |     total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \ | ||||||
|  |     total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \ | ||||||
|  |     total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \ | ||||||
|  |     total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \ | ||||||
|  |  | ||||||
|  |  | ||||||
|  | __attribute__((qcom_reqd_sub_group_size("full"))) | ||||||
|  | __kernel void kernel_gemv_noshuffle( | ||||||
|  |         __read_only  image1d_buffer_t src0_q,  // quantized A | ||||||
|  |         global half2  * src0_d,  // A scales | ||||||
|  |         __read_only  image1d_buffer_t src1,    // B | ||||||
|  |         ulong offset1,            // offset to B (0) | ||||||
|  |         global float * dst,     // C | ||||||
|  |         ulong offsetd,            // offset to C (0) | ||||||
|  |         uint K,               // K | ||||||
|  |         int ne01,               // M | ||||||
|  |         int ne02,               // 1 | ||||||
|  |         int ne10,               // K | ||||||
|  |         int ne12,               // 1 | ||||||
|  |         int ne0,                // M | ||||||
|  |         int ne1,                // N | ||||||
|  |         int r2,                 // 1 | ||||||
|  |         int r3) | ||||||
|  | { | ||||||
|  |     uint groupId = get_local_id(1); | ||||||
|  |     uint gid     = get_global_id(0); | ||||||
|  |     ushort slid    = get_sub_group_local_id(); | ||||||
|  |  | ||||||
|  |     __private uint4     regA; | ||||||
|  |     __private half2     regS; | ||||||
|  |     __private float8    regB; | ||||||
|  |  | ||||||
|  |     __private float2 totalSum = (float2)(0.0f); | ||||||
|  |  | ||||||
|  |     // loop along K in block granularity, skip 4 blocks every iter | ||||||
|  |     for (uint k = groupId; k < (K / QK4_0); k += N_SIMDGROUP) { | ||||||
|  |         regS = src0_d[gid + k * LINE_STRIDE_A]; // each fiber loads scale of two rows | ||||||
|  |         // first 4 fibers in each wave load 8 B values to its private scope | ||||||
|  |         if (slid < 4) { | ||||||
|  |             regB.s0123 = read_imagef(src1, (slid * 2 + k * 8)); | ||||||
|  |             regB.s4567 = read_imagef(src1, (1 + slid * 2 + k * 8)); | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         // load half weights for two blocks in consecutive rows | ||||||
|  |         regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 0)).x; | ||||||
|  |         regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 1)).x; | ||||||
|  |         regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 2)).x; | ||||||
|  |         regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 3)).x; | ||||||
|  | #ifdef VECTOR_SUB_GROUP_BROADCAT | ||||||
|  |         dequantizeBlockAccum_ns_sgbroadcast_8_hi(totalSum, as_ushort8(regA), regS, regB); | ||||||
|  | #else | ||||||
|  |         dequantizeBlockAccum_ns_sgbroadcast_1_hi(totalSum, as_ushort8(regA), regS, regB); | ||||||
|  | #endif // VECTOR_SUB_GROUP_BROADCAT | ||||||
|  |  | ||||||
|  |         regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 4)).x; | ||||||
|  |         regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 5)).x; | ||||||
|  |         regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 6)).x; | ||||||
|  |         regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 7)).x; | ||||||
|  | #ifdef VECTOR_SUB_GROUP_BROADCAT | ||||||
|  |         dequantizeBlockAccum_ns_sgbroadcast_8_lo(totalSum, as_ushort8(regA), regS, regB); | ||||||
|  | #else | ||||||
|  |         dequantizeBlockAccum_ns_sgbroadcast_1_lo(totalSum, as_ushort8(regA), regS, regB); | ||||||
|  | #endif // VECTOR_SUB_GROUP_BROADCAT | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     // reduction in local memory, assumes #wave=4 | ||||||
|  |     __local float2 reduceLM[SIMDGROUP_WIDTH * 3]; | ||||||
|  |     if (groupId == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = totalSum; | ||||||
|  |     if (groupId == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = totalSum; | ||||||
|  |     if (groupId == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = totalSum; | ||||||
|  |     barrier(CLK_LOCAL_MEM_FENCE); | ||||||
|  |     if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 0 + slid]; | ||||||
|  |     if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 1 + slid]; | ||||||
|  |     if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 2 + slid]; | ||||||
|  |  | ||||||
|  |     // 2 outputs per fiber in wave 0 | ||||||
|  |     if (groupId == 0) { | ||||||
|  |         dst = (global float*)((global char*)dst + offsetd); | ||||||
|  |         vstore2(totalSum, 0, &(dst[gid * 2])); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  | } | ||||||
| @@ -0,0 +1,271 @@ | |||||||
|  | #pragma OPENCL EXTENSION cl_khr_fp16 : enable | ||||||
|  | #pragma OPENCL EXTENSION cl_khr_subgroups : enable | ||||||
|  | #pragma OPENCL EXTENSION cl_qcom_subgroup_uniform_load: enable | ||||||
|  | #pragma OPENCL EXTENSION cl_qcom_subgroup_constant_load: enable | ||||||
|  | #pragma OPENCL EXTENSION cl_qcom_extra_vector_types : enable | ||||||
|  | #pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable | ||||||
|  |  | ||||||
|  | // assume | ||||||
|  | #define QK4_0 32 | ||||||
|  | #define N_SIMDGROUP 4 | ||||||
|  |  | ||||||
|  | #define dequantizeBlockAccum_ns_sgbroadcast_1_hi(total_sums, bits4, scale, y) \ | ||||||
|  |     float shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s0, 0); \ | ||||||
|  |     total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s1, 0); \ | ||||||
|  |     total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s2, 0); \ | ||||||
|  |     total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s3, 0); \ | ||||||
|  |     total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s4, 0); \ | ||||||
|  |     total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s5, 0); \ | ||||||
|  |     total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s6, 0); \ | ||||||
|  |     total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s7, 0); \ | ||||||
|  |     total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s0, 1); \ | ||||||
|  |     total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s1, 1); \ | ||||||
|  |     total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s2, 1); \ | ||||||
|  |     total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s3, 1); \ | ||||||
|  |     total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s4, 1); \ | ||||||
|  |     total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s5, 1); \ | ||||||
|  |     total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s6, 1); \ | ||||||
|  |     total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s7, 1); \ | ||||||
|  |     total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \ | ||||||
|  |  | ||||||
|  |  | ||||||
|  | #define dequantizeBlockAccum_ns_sgbroadcast_1_lo(total_sums, bits4, scale, y) \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s0, 2); \ | ||||||
|  |     total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s1, 2); \ | ||||||
|  |     total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s2, 2); \ | ||||||
|  |     total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s3, 2); \ | ||||||
|  |     total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s4, 2); \ | ||||||
|  |     total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s5, 2); \ | ||||||
|  |     total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s6, 2); \ | ||||||
|  |     total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s7, 2); \ | ||||||
|  |     total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s0, 3); \ | ||||||
|  |     total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s1, 3); \ | ||||||
|  |     total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s2, 3); \ | ||||||
|  |     total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s3, 3); \ | ||||||
|  |     total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s4, 3); \ | ||||||
|  |     total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s5, 3); \ | ||||||
|  |     total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s6, 3); \ | ||||||
|  |     total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y.s7, 3); \ | ||||||
|  |     total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y; \ | ||||||
|  |     total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y; \ | ||||||
|  |  | ||||||
|  |  | ||||||
|  | #define dequantizeBlockAccum_ns_sgbroadcast_8_hi(total_sums, bits4, scale, y) \ | ||||||
|  |     float8 shared_y; \ | ||||||
|  |     shared_y = sub_group_broadcast(y, 0); \ | ||||||
|  |     total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \ | ||||||
|  |     total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \ | ||||||
|  |     total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \ | ||||||
|  |     total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \ | ||||||
|  |     total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \ | ||||||
|  |     total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \ | ||||||
|  |     total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \ | ||||||
|  |     total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \ | ||||||
|  |     total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \ | ||||||
|  |     total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \ | ||||||
|  |     total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \ | ||||||
|  |     total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \ | ||||||
|  |     total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \ | ||||||
|  |     total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \ | ||||||
|  |     total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \ | ||||||
|  |     total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \ | ||||||
|  |     shared_y = sub_group_broadcast(y, 1); \ | ||||||
|  |     total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \ | ||||||
|  |     total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \ | ||||||
|  |     total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \ | ||||||
|  |     total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \ | ||||||
|  |     total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \ | ||||||
|  |     total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \ | ||||||
|  |     total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \ | ||||||
|  |     total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \ | ||||||
|  |     total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \ | ||||||
|  |     total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \ | ||||||
|  |     total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \ | ||||||
|  |     total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \ | ||||||
|  |     total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \ | ||||||
|  |     total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \ | ||||||
|  |     total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \ | ||||||
|  |     total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \ | ||||||
|  |  | ||||||
|  |  | ||||||
|  | #define dequantizeBlockAccum_ns_sgbroadcast_8_lo(total_sums, bits4, scale, y) \ | ||||||
|  |     shared_y = sub_group_broadcast(y, 2); \ | ||||||
|  |     total_sums.s0 += ((bits4.s0 & 0x000F) - 8) * scale.s0 * shared_y.s0; \ | ||||||
|  |     total_sums.s0 += (((bits4.s0 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \ | ||||||
|  |     total_sums.s0 += (((bits4.s0 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \ | ||||||
|  |     total_sums.s0 += (((bits4.s0 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \ | ||||||
|  |     total_sums.s0 += ((bits4.s2 & 0x000F) - 8) * scale.s0 * shared_y.s4; \ | ||||||
|  |     total_sums.s0 += (((bits4.s2 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \ | ||||||
|  |     total_sums.s0 += (((bits4.s2 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \ | ||||||
|  |     total_sums.s0 += (((bits4.s2 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \ | ||||||
|  |     total_sums.s1 += ((bits4.s1 & 0x000F) - 8) * scale.s1 * shared_y.s0; \ | ||||||
|  |     total_sums.s1 += (((bits4.s1 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \ | ||||||
|  |     total_sums.s1 += (((bits4.s1 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \ | ||||||
|  |     total_sums.s1 += (((bits4.s1 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \ | ||||||
|  |     total_sums.s1 += ((bits4.s3 & 0x000F) - 8) * scale.s1 * shared_y.s4; \ | ||||||
|  |     total_sums.s1 += (((bits4.s3 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \ | ||||||
|  |     total_sums.s1 += (((bits4.s3 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \ | ||||||
|  |     total_sums.s1 += (((bits4.s3 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \ | ||||||
|  |     shared_y = sub_group_broadcast(y, 3); \ | ||||||
|  |     total_sums.s0 += ((bits4.s4 & 0x000F) - 8) * scale.s0 * shared_y.s0; \ | ||||||
|  |     total_sums.s0 += (((bits4.s4 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s1; \ | ||||||
|  |     total_sums.s0 += (((bits4.s4 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s2; \ | ||||||
|  |     total_sums.s0 += (((bits4.s4 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s3; \ | ||||||
|  |     total_sums.s0 += ((bits4.s6 & 0x000F) - 8) * scale.s0 * shared_y.s4; \ | ||||||
|  |     total_sums.s0 += (((bits4.s6 & 0x00F0) >> 4) - 8) * scale.s0 * shared_y.s5; \ | ||||||
|  |     total_sums.s0 += (((bits4.s6 & 0x0F00) >> 8) - 8) * scale.s0 * shared_y.s6; \ | ||||||
|  |     total_sums.s0 += (((bits4.s6 & 0xF000) >> 12) - 8) * scale.s0 * shared_y.s7; \ | ||||||
|  |     total_sums.s1 += ((bits4.s5 & 0x000F) - 8) * scale.s1 * shared_y.s0; \ | ||||||
|  |     total_sums.s1 += (((bits4.s5 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s1; \ | ||||||
|  |     total_sums.s1 += (((bits4.s5 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s2; \ | ||||||
|  |     total_sums.s1 += (((bits4.s5 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s3; \ | ||||||
|  |     total_sums.s1 += ((bits4.s7 & 0x000F) - 8) * scale.s1 * shared_y.s4; \ | ||||||
|  |     total_sums.s1 += (((bits4.s7 & 0x00F0) >> 4) - 8) * scale.s1 * shared_y.s5; \ | ||||||
|  |     total_sums.s1 += (((bits4.s7 & 0x0F00) >> 8) - 8) * scale.s1 * shared_y.s6; \ | ||||||
|  |     total_sums.s1 += (((bits4.s7 & 0xF000) >> 12) - 8) * scale.s1 * shared_y.s7; \ | ||||||
|  |  | ||||||
|  |  | ||||||
|  | __attribute__((qcom_reqd_sub_group_size("full"))) | ||||||
|  | __kernel void kernel_gemv_noshuffle( | ||||||
|  |         __read_only  image1d_buffer_t src0_q,  // quantized A | ||||||
|  |         global half2  * src0_d,  // A scales | ||||||
|  |         __read_only  image1d_buffer_t src1,    // B | ||||||
|  |         ulong offset1,            // offset to B (0) | ||||||
|  |         global float * dst,     // C | ||||||
|  |         ulong offsetd,            // offset to C (0) | ||||||
|  |         int ne00,               // K | ||||||
|  |         int ne01,               // M | ||||||
|  |         int ne02,               // 1 | ||||||
|  |         int ne10,               // K | ||||||
|  |         int ne12,               // 1 | ||||||
|  |         int ne0,                // M | ||||||
|  |         int ne1,                // N | ||||||
|  |         int r2,                 // 1 | ||||||
|  |         int r3) | ||||||
|  | { | ||||||
|  |     uint groupId = get_local_id(1); | ||||||
|  |     uint gid     = get_global_id(0); | ||||||
|  |     ushort slid    = get_sub_group_local_id(); | ||||||
|  |  | ||||||
|  |     uint K = ne00; | ||||||
|  |     uint M = ne01; | ||||||
|  |  | ||||||
|  |     uint LINE_STRIDE_A = M / 2; | ||||||
|  |     uint BLOCK_STRIDE_A = N_SIMDGROUP * M; | ||||||
|  |  | ||||||
|  |     __private uint4     regA; | ||||||
|  |     __private half2     regS; | ||||||
|  |     __private float8    regB; | ||||||
|  |  | ||||||
|  |     __private float2 totalSum = (float2)(0.0f); | ||||||
|  |  | ||||||
|  |     // loop along K in block granularity, skip 4 blocks every iter | ||||||
|  |     for (uint k = groupId; k < (K / QK4_0); k += N_SIMDGROUP) { | ||||||
|  |         regS = src0_d[gid + k * LINE_STRIDE_A]; // each fiber loads scale of two rows | ||||||
|  |         // first 4 fibers in each wave load 8 B values to its private scope | ||||||
|  |         if (slid < 4) { | ||||||
|  |             regB.s0123 = read_imagef(src1, (slid * 2 + k * 8)); | ||||||
|  |             regB.s4567 = read_imagef(src1, (1 + slid * 2 + k * 8)); | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         // load half weights for two blocks in consecutive rows | ||||||
|  |         regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 0)).x; | ||||||
|  |         regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 1)).x; | ||||||
|  |         regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 2)).x; | ||||||
|  |         regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 3)).x; | ||||||
|  | #ifdef VECTOR_SUB_GROUP_BROADCAT | ||||||
|  |         dequantizeBlockAccum_ns_sgbroadcast_8_hi(totalSum, as_ushort8(regA), regS, regB); | ||||||
|  | #else | ||||||
|  |         dequantizeBlockAccum_ns_sgbroadcast_1_hi(totalSum, as_ushort8(regA), regS, regB); | ||||||
|  | #endif // VECTOR_SUB_GROUP_BROADCAT | ||||||
|  |  | ||||||
|  |         regA.s0 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 4)).x; | ||||||
|  |         regA.s1 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 5)).x; | ||||||
|  |         regA.s2 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 6)).x; | ||||||
|  |         regA.s3 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 7)).x; | ||||||
|  | #ifdef VECTOR_SUB_GROUP_BROADCAT | ||||||
|  |         dequantizeBlockAccum_ns_sgbroadcast_8_lo(totalSum, as_ushort8(regA), regS, regB); | ||||||
|  | #else | ||||||
|  |         dequantizeBlockAccum_ns_sgbroadcast_1_lo(totalSum, as_ushort8(regA), regS, regB); | ||||||
|  | #endif // VECTOR_SUB_GROUP_BROADCAT | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     // reduction in local memory, assumes #wave=4 | ||||||
|  |     __local float2 reduceLM[SIMDGROUP_WIDTH * 3]; | ||||||
|  |     if (groupId == 1) reduceLM[SIMDGROUP_WIDTH * 0 + slid] = totalSum; | ||||||
|  |     if (groupId == 2) reduceLM[SIMDGROUP_WIDTH * 1 + slid] = totalSum; | ||||||
|  |     if (groupId == 3) reduceLM[SIMDGROUP_WIDTH * 2 + slid] = totalSum; | ||||||
|  |     barrier(CLK_LOCAL_MEM_FENCE); | ||||||
|  |     if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 0 + slid]; | ||||||
|  |     if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 1 + slid]; | ||||||
|  |     if (groupId == 0) totalSum += reduceLM[SIMDGROUP_WIDTH * 2 + slid]; | ||||||
|  |  | ||||||
|  |     // 2 outputs per fiber in wave 0 | ||||||
|  |     if (groupId == 0) { | ||||||
|  |         dst = (global float*)((global char*)dst + offsetd); | ||||||
|  |         vstore2(totalSum, 0, &(dst[gid * 2])); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  | } | ||||||
							
								
								
									
										1225
									
								
								ggml/src/ggml-opencl/kernels/ggml-opencl_mm.cl
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1225
									
								
								ggml/src/ggml-opencl/kernels/ggml-opencl_mm.cl
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										130
									
								
								ggml/src/ggml-opencl/kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										130
									
								
								ggml/src/ggml-opencl/kernels/ggml-opencl_mul_mat_Ab_Bi_8x4.cl
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,130 @@ | |||||||
|  | // src0_q, src0_d, src1 are transposed as a preprocessing step | ||||||
|  | // 4-bit weights are transposed in groups of 4 (unsigned short int) | ||||||
|  | // consider weights originally "next to each other", now "on top of each other" | ||||||
|  | // each fiber computes a 8x4 tile of output elements | ||||||
|  | // using unshuffled weights | ||||||
|  |  | ||||||
|  | #pragma OPENCL EXTENSION cl_khr_fp16 : enable | ||||||
|  | #pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable | ||||||
|  |  | ||||||
|  | __attribute__((qcom_reqd_sub_group_size("full"))) | ||||||
|  | kernel void kernel_mul_mat_Ab_Bi_8x4( | ||||||
|  |         global const ushort * src0_q,       // quantized A | ||||||
|  |         global const half  * src0_d,        // A scales | ||||||
|  |         __read_only image1d_buffer_t src1,  // B (1d image) | ||||||
|  |         global float * dst,                 // C | ||||||
|  |         int m,                              // M | ||||||
|  |         int n,                              // N with padding | ||||||
|  |         int k,                              // K | ||||||
|  |         int n_no_padding                    // N without padding | ||||||
|  | ) { | ||||||
|  |  | ||||||
|  |     int m_4 = m >> 2; | ||||||
|  |     int n_4 = n >> 2; | ||||||
|  |  | ||||||
|  |     int gy = get_global_id(0); | ||||||
|  |     int gx = get_global_id(1); | ||||||
|  |     int gx_2 = gx << 2; | ||||||
|  |  | ||||||
|  |     half8 c0 = 0, c1 = 0, c2 = 0, c3 = 0; // 8x4 output elements | ||||||
|  |     half8 B; // registers for activations | ||||||
|  |     half4 dequantized_weights; // registers for dequantized weights | ||||||
|  |     __global const ushort* weight_ptr = src0_q + gx_2; // pointer for weights | ||||||
|  |     __global const half* scale_ptr = src0_d + gx_2; // pointer for scales | ||||||
|  |  | ||||||
|  |     for(int i=0; i<k; i+=4){ //loop through K dimension | ||||||
|  |  | ||||||
|  |         B.s0123 = read_imageh(src1, gy*2 + (i)*(n_4)); | ||||||
|  |         B.s4567 = read_imageh(src1, gy*2 + (i)*(n_4)+1); | ||||||
|  |  | ||||||
|  |         // keep (i/4) and (i/32) in parenthesis, rounds down | ||||||
|  |         // load 4 consecutive groups of 4 weights | ||||||
|  |         ushort4 bits4 = vload4(0, weight_ptr + (i/4)*(m)); // (i/4) because weights grouped in 4s | ||||||
|  |  | ||||||
|  |         // load 4 consecutive scales | ||||||
|  |         half4 scale = vload4(0, scale_ptr + (i/32)*(m));// (i/32) because 1 scale per 32 elements | ||||||
|  |  | ||||||
|  |         // j=0 | ||||||
|  |         dequantized_weights.s0 = ((bits4.s0 & (0x000F)) - 8) * scale.s0; // dequantize a row of the 16 weights | ||||||
|  |         dequantized_weights.s1 = ((bits4.s1 & (0x000F)) - 8) * scale.s1; | ||||||
|  |         dequantized_weights.s2 = ((bits4.s2 & (0x000F)) - 8) * scale.s2; | ||||||
|  |         dequantized_weights.s3 = ((bits4.s3 & (0x000F)) - 8) * scale.s3; | ||||||
|  |         c0 += B * dequantized_weights.s0; // vector-scalar multiplication to accumulate | ||||||
|  |         c1 += B * dequantized_weights.s1; | ||||||
|  |         c2 += B * dequantized_weights.s2; | ||||||
|  |         c3 += B * dequantized_weights.s3; | ||||||
|  |  | ||||||
|  |         // j=1 | ||||||
|  |         B.s0123 = read_imageh(src1, gy*2 + (i+1)*(n_4)); | ||||||
|  |         B.s4567 = read_imageh(src1, gy*2 + (i+1)*(n_4)+1); | ||||||
|  |         dequantized_weights.s0 = (((bits4.s0 & (0x00F0)) >> 4) - 8) * scale.s0; // dequantize a row of the 16 weights | ||||||
|  |         dequantized_weights.s1 = (((bits4.s1 & (0x00F0)) >> 4) - 8) * scale.s1; | ||||||
|  |         dequantized_weights.s2 = (((bits4.s2 & (0x00F0)) >> 4) - 8) * scale.s2; | ||||||
|  |         dequantized_weights.s3 = (((bits4.s3 & (0x00F0)) >> 4) - 8) * scale.s3; | ||||||
|  |         c0 += B * dequantized_weights.s0; //vector-scalar multiplication to accumulate | ||||||
|  |         c1 += B * dequantized_weights.s1; | ||||||
|  |         c2 += B * dequantized_weights.s2; | ||||||
|  |         c3 += B * dequantized_weights.s3; | ||||||
|  |  | ||||||
|  |         // j=2 | ||||||
|  |         B.s0123 = read_imageh(src1, gy*2 + (i+2)*(n_4)); | ||||||
|  |         B.s4567 = read_imageh(src1, gy*2 + (i+2)*(n_4)+1); | ||||||
|  |         dequantized_weights.s0 = (((bits4.s0 & (0x0F00)) >> 8) - 8) * scale.s0; // dequantize a row of the 16 weights | ||||||
|  |         dequantized_weights.s1 = (((bits4.s1 & (0x0F00)) >> 8) - 8) * scale.s1; | ||||||
|  |         dequantized_weights.s2 = (((bits4.s2 & (0x0F00)) >> 8) - 8) * scale.s2; | ||||||
|  |         dequantized_weights.s3 = (((bits4.s3 & (0x0F00)) >> 8) - 8) * scale.s3; | ||||||
|  |         c0 += B * dequantized_weights.s0; // vector-scalar multiplication to accumulate | ||||||
|  |         c1 += B * dequantized_weights.s1; | ||||||
|  |         c2 += B * dequantized_weights.s2; | ||||||
|  |         c3 += B * dequantized_weights.s3; | ||||||
|  |  | ||||||
|  |         // j=3 | ||||||
|  |         B.s0123 = read_imageh(src1, gy*2 + (i+3)*(n_4)); | ||||||
|  |         B.s4567 = read_imageh(src1, gy*2 + (i+3)*(n_4)+1); | ||||||
|  |         dequantized_weights.s0 = (((bits4.s0 & (0xF000)) >> 12) - 8) * scale.s0; // dequantize a row of the 16 weights | ||||||
|  |         dequantized_weights.s1 = (((bits4.s1 & (0xF000)) >> 12) - 8) * scale.s1; | ||||||
|  |         dequantized_weights.s2 = (((bits4.s2 & (0xF000)) >> 12) - 8) * scale.s2; | ||||||
|  |         dequantized_weights.s3 = (((bits4.s3 & (0xF000)) >> 12) - 8) * scale.s3; | ||||||
|  |         c0 += B * dequantized_weights.s0; // vector-scalar multiplication to accumulate | ||||||
|  |         c1 += B * dequantized_weights.s1; | ||||||
|  |         c2 += B * dequantized_weights.s2; | ||||||
|  |         c3 += B * dequantized_weights.s3; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     int idx = (gy<<3)*m + (gx<<2); // vectorized store 16 elements | ||||||
|  |  | ||||||
|  |     // conditional check if store is to a valid location. Required when N is not a multiple of 8 | ||||||
|  |     // if statements allow registers to be reused for each store | ||||||
|  |     // provides a performance boost due to reduced register footprint, which increases number of concurrent waves | ||||||
|  |     if(idx+3 < m*n_no_padding){ | ||||||
|  |         vstore4((float4)(c0.s0, c1.s0, c2.s0, c3.s0), 0, dst + idx); | ||||||
|  |         idx += m; | ||||||
|  |     } | ||||||
|  |     if(idx+3 < m*n_no_padding){ | ||||||
|  |         vstore4((float4)(c0.s1, c1.s1, c2.s1, c3.s1), 0, dst + idx); | ||||||
|  |         idx += m; | ||||||
|  |     } | ||||||
|  |     if(idx+3 < m*n_no_padding){ | ||||||
|  |         vstore4((float4)(c0.s2, c1.s2, c2.s2, c3.s2), 0, dst + idx); | ||||||
|  |         idx += m; | ||||||
|  |     } | ||||||
|  |     if(idx+3 < m*n_no_padding){ | ||||||
|  |         vstore4((float4)(c0.s3, c1.s3, c2.s3, c3.s3), 0, dst + idx); | ||||||
|  |         idx += m; | ||||||
|  |     } | ||||||
|  |     if(idx+3 < m*n_no_padding){ | ||||||
|  |         vstore4((float4)(c0.s4, c1.s4, c2.s4, c3.s4), 0, dst + idx); | ||||||
|  |         idx += m; | ||||||
|  |     } | ||||||
|  |     if(idx+3 < m*n_no_padding){ | ||||||
|  |         vstore4((float4)(c0.s5, c1.s5, c2.s5, c3.s5), 0, dst + idx); | ||||||
|  |         idx += m; | ||||||
|  |     } | ||||||
|  |     if(idx+3 < m*n_no_padding){ | ||||||
|  |         vstore4((float4)(c0.s6, c1.s6, c2.s6, c3.s6), 0, dst + idx); | ||||||
|  |         idx += m; | ||||||
|  |     } | ||||||
|  |     if(idx+3 < m*n_no_padding){ | ||||||
|  |         vstore4((float4)(c0.s7, c1.s7, c2.s7, c3.s7), 0, dst + idx); | ||||||
|  |     } | ||||||
|  | } | ||||||
							
								
								
									
										32
									
								
								ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_16.cl
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										32
									
								
								ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_16.cl
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,32 @@ | |||||||
|  | // 16-bit transpose, loading/storing an 8x8 tile of elements | ||||||
|  |  | ||||||
|  | kernel void kernel_transpose_16( | ||||||
|  |     __read_only image1d_buffer_t input, | ||||||
|  |     __write_only image1d_buffer_t output, | ||||||
|  |     const uint rows, | ||||||
|  |     const uint cols | ||||||
|  | ) { | ||||||
|  |  | ||||||
|  |     const int i = get_global_id(0); | ||||||
|  |     const int j = get_global_id(1); | ||||||
|  |     const int i_3 = i<<3; | ||||||
|  |     const int j_3 = j<<3; | ||||||
|  |  | ||||||
|  |     ushort8 temp0 = as_ushort8(read_imagef(input, (j_3+0)*cols+i)); | ||||||
|  |     ushort8 temp1 = as_ushort8(read_imagef(input, (j_3+1)*cols+i)); | ||||||
|  |     ushort8 temp2 = as_ushort8(read_imagef(input, (j_3+2)*cols+i)); | ||||||
|  |     ushort8 temp3 = as_ushort8(read_imagef(input, (j_3+3)*cols+i)); | ||||||
|  |     ushort8 temp4 = as_ushort8(read_imagef(input, (j_3+4)*cols+i)); | ||||||
|  |     ushort8 temp5 = as_ushort8(read_imagef(input, (j_3+5)*cols+i)); | ||||||
|  |     ushort8 temp6 = as_ushort8(read_imagef(input, (j_3+6)*cols+i)); | ||||||
|  |     ushort8 temp7 = as_ushort8(read_imagef(input, (j_3+7)*cols+i)); | ||||||
|  |  | ||||||
|  |     write_imagef(output, (i_3+0)*rows+j, as_float4((ushort8)(temp0.s0, temp1.s0, temp2.s0, temp3.s0, temp4.s0, temp5.s0, temp6.s0, temp7.s0))); | ||||||
|  |     write_imagef(output, (i_3+1)*rows+j, as_float4((ushort8)(temp0.s1, temp1.s1, temp2.s1, temp3.s1, temp4.s1, temp5.s1, temp6.s1, temp7.s1))); | ||||||
|  |     write_imagef(output, (i_3+2)*rows+j, as_float4((ushort8)(temp0.s2, temp1.s2, temp2.s2, temp3.s2, temp4.s2, temp5.s2, temp6.s2, temp7.s2))); | ||||||
|  |     write_imagef(output, (i_3+3)*rows+j, as_float4((ushort8)(temp0.s3, temp1.s3, temp2.s3, temp3.s3, temp4.s3, temp5.s3, temp6.s3, temp7.s3))); | ||||||
|  |     write_imagef(output, (i_3+4)*rows+j, as_float4((ushort8)(temp0.s4, temp1.s4, temp2.s4, temp3.s4, temp4.s4, temp5.s4, temp6.s4, temp7.s4))); | ||||||
|  |     write_imagef(output, (i_3+5)*rows+j, as_float4((ushort8)(temp0.s5, temp1.s5, temp2.s5, temp3.s5, temp4.s5, temp5.s5, temp6.s5, temp7.s5))); | ||||||
|  |     write_imagef(output, (i_3+6)*rows+j, as_float4((ushort8)(temp0.s6, temp1.s6, temp2.s6, temp3.s6, temp4.s6, temp5.s6, temp6.s6, temp7.s6))); | ||||||
|  |     write_imagef(output, (i_3+7)*rows+j, as_float4((ushort8)(temp0.s7, temp1.s7, temp2.s7, temp3.s7, temp4.s7, temp5.s7, temp6.s7, temp7.s7))); | ||||||
|  | } | ||||||
							
								
								
									
										25
									
								
								ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_32.cl
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										25
									
								
								ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_32.cl
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,25 @@ | |||||||
|  | // 32-bit transpose, loading/storing a 4x4 tile of elements | ||||||
|  |  | ||||||
|  | kernel void kernel_transpose_32( | ||||||
|  |     __read_only image1d_buffer_t input, | ||||||
|  |     __write_only image1d_buffer_t output, | ||||||
|  |     const uint rows, | ||||||
|  |     const uint cols | ||||||
|  | ) { | ||||||
|  |  | ||||||
|  |     const int i = get_global_id(0); | ||||||
|  |     const int j = get_global_id(1); | ||||||
|  |     const int i_2 = i<<2; | ||||||
|  |     const int j_2 = j<<2; | ||||||
|  |  | ||||||
|  |     float4 temp0 = read_imagef(input, (j_2+0)*cols+i); | ||||||
|  |     float4 temp1 = read_imagef(input, (j_2+1)*cols+i); | ||||||
|  |     float4 temp2 = read_imagef(input, (j_2+2)*cols+i); | ||||||
|  |     float4 temp3 = read_imagef(input, (j_2+3)*cols+i); | ||||||
|  |  | ||||||
|  |     write_imagef(output, (i_2+0)*rows+j, (float4)(temp0.s0, temp1.s0, temp2.s0, temp3.s0)); | ||||||
|  |     write_imagef(output, (i_2+1)*rows+j, (float4)(temp0.s1, temp1.s1, temp2.s1, temp3.s1)); | ||||||
|  |     write_imagef(output, (i_2+2)*rows+j, (float4)(temp0.s2, temp1.s2, temp2.s2, temp3.s2)); | ||||||
|  |     write_imagef(output, (i_2+3)*rows+j, (float4)(temp0.s3, temp1.s3, temp2.s3, temp3.s3)); | ||||||
|  |  | ||||||
|  | } | ||||||
							
								
								
									
										35
									
								
								ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_32_16.cl
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										35
									
								
								ggml/src/ggml-opencl/kernels/ggml-opencl_transpose_32_16.cl
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,35 @@ | |||||||
|  | // 32-bit transpose, loading/storing a 4x4 tile of elements | ||||||
|  | // Only used for activations | ||||||
|  | // converts to FP16 | ||||||
|  | // also adds zero padding for non multiple of 8 prompt lengths | ||||||
|  | #pragma OPENCL EXTENSION cl_khr_fp16 : enable | ||||||
|  |  | ||||||
|  | kernel void kernel_transpose_32_16(__read_only image1d_buffer_t input, __write_only image1d_buffer_t output, const uint rows, const uint cols, const uint padded_rows) { | ||||||
|  |  | ||||||
|  |     const int i = get_global_id(0); | ||||||
|  |     const int j = get_global_id(1); | ||||||
|  |     const int i_2 = i<<2; | ||||||
|  |     const int j_2 = j<<2; | ||||||
|  |     half4 temp0 = {0,0,0,0}; // initialize outputs to 0 | ||||||
|  |     half4 temp1 = {0,0,0,0}; | ||||||
|  |     half4 temp2 = {0,0,0,0}; | ||||||
|  |     half4 temp3 = {0,0,0,0}; | ||||||
|  |  | ||||||
|  |     if((j_2+0)*cols+i*4+3 < rows*cols*16){ // only load from a valid location. Otherwise keep register data as 0 | ||||||
|  |         temp0 = read_imageh(input, (j_2+0)*cols+i); | ||||||
|  |     } | ||||||
|  |     if((j_2+1)*cols+i*4+3 < rows*cols*16){ | ||||||
|  |         temp1 = read_imageh(input, (j_2+1)*cols+i); | ||||||
|  |     } | ||||||
|  |     if((j_2+2)*cols+i*4+3 < rows*cols*16){ | ||||||
|  |         temp2 = read_imageh(input, (j_2+2)*cols+i); | ||||||
|  |     } | ||||||
|  |     if((j_2+3)*cols+i*4+3 < rows*cols*16){ | ||||||
|  |         temp3 = read_imageh(input, (j_2+3)*cols+i); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     write_imageh(output, (i_2+0)*padded_rows+j, (half4)(temp0.s0, temp1.s0, temp2.s0, temp3.s0)); // no conditionals for output, includes zero padding | ||||||
|  |     write_imageh(output, (i_2+1)*padded_rows+j, (half4)(temp0.s1, temp1.s1, temp2.s1, temp3.s1)); | ||||||
|  |     write_imageh(output, (i_2+2)*padded_rows+j, (half4)(temp0.s2, temp1.s2, temp2.s2, temp3.s2)); | ||||||
|  |     write_imageh(output, (i_2+3)*padded_rows+j, (half4)(temp0.s3, temp1.s3, temp2.s3, temp3.s3)); | ||||||
|  | } | ||||||
		Reference in New Issue
	
	Block a user
	 lhez
					lhez