Mirror of https://github.com/ggml-org/llama.cpp.git

musa: upgrade musa sdk to rc4.2.0 (#14498)
Squashed commits:

* musa: apply mublas API changes
* musa: update musa version to 4.2.0
* musa: restore MUSA graph settings in CMakeLists.txt
* musa: disable mudnnMemcpyAsync by default
* musa: switch back to non-mudnn images
* minor changes
* musa: restore rc in docker image tag

Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>
MUSA Dockerfile:

```diff
@@ -1,10 +1,10 @@
 ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
-ARG MUSA_VERSION=rc4.0.1
+ARG MUSA_VERSION=rc4.2.0
 # Target the MUSA build image
-ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-devel-ubuntu${UBUNTU_VERSION}
+ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64
 
-ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-runtime-ubuntu${UBUNTU_VERSION}
+ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64
 
 FROM ${BASE_MUSA_DEV_CONTAINER} AS build
```
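The `-mudnn` image variants are dropped and the base tags gain an `-amd64` architecture suffix, so local image builds need the new tag scheme. A minimal sketch of such a build, assuming the Dockerfile sits at `.devops/musa.Dockerfile` with a `full` target as in the llama.cpp tree (path, target, and output tag are assumptions, not part of this commit):

```sh
# Hedged example: override MUSA_VERSION explicitly, matching the new default.
docker build -t local/llama.cpp:full-musa \
    --build-arg MUSA_VERSION=rc4.2.0 \
    --target full \
    -f .devops/musa.Dockerfile .
```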
							
								
								
									
.github/workflows/build.yml:
```diff
@@ -515,7 +515,7 @@ jobs:
 
   ubuntu-22-cmake-musa:
     runs-on: ubuntu-22.04
-    container: mthreads/musa:rc4.0.1-mudnn-devel-ubuntu22.04
+    container: mthreads/musa:rc4.2.0-devel-ubuntu22.04-amd64
 
     steps:
       - name: Clone
```
CI documentation:

````diff
@@ -54,7 +54,7 @@ docker run --privileged -it \
     -v $HOME/llama.cpp/ci-cache:/ci-cache \
     -v $HOME/llama.cpp/ci-results:/ci-results \
     -v $PWD:/ws -w /ws \
-    mthreads/musa:rc4.0.1-mudnn-devel-ubuntu22.04
+    mthreads/musa:rc4.2.0-devel-ubuntu22.04-amd64
 ```
 
 Inside the container, execute the following commands:
````
Docker build documentation:

```diff
@@ -110,7 +110,7 @@ You may want to pass in some different `ARGS`, depending on the MUSA environment
 
 The defaults are:
 
-- `MUSA_VERSION` set to `rc4.0.1`
+- `MUSA_VERSION` set to `rc4.2.0`
 
 The resulting images, are essentially the same as the non-MUSA images:
```
ggml CMake options:

```diff
@@ -174,6 +174,8 @@ option(GGML_HIP_GRAPHS                      "ggml: use HIP graph, experimental,
 option(GGML_HIP_NO_VMM                      "ggml: do not try to use HIP VMM"                 ON)
 option(GGML_HIP_ROCWMMA_FATTN               "ggml: enable rocWMMA for FlashAttention"         OFF)
 option(GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12   "ggml: enable rocWMMA FlashAttention on GFX12"    OFF)
+option(GGML_MUSA_GRAPHS                     "ggml: use MUSA graph, experimental, unstable"    OFF)
+option(GGML_MUSA_MUDNN_COPY                 "ggml: enable muDNN for accelerated copy"         OFF)
 option(GGML_VULKAN                          "ggml: use Vulkan"                                OFF)
 option(GGML_VULKAN_CHECK_RESULTS            "ggml: run Vulkan op checks"                      OFF)
 option(GGML_VULKAN_DEBUG                    "ggml: enable Vulkan debug output"                OFF)
```
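Both new options default to OFF, so a stock MUSA build behaves as before; they must be opted into at configure time. A minimal configure sketch, assuming the existing `GGML_MUSA=ON` backend switch used elsewhere in llama.cpp:

```sh
# Hedged example: enable the MUSA backend plus both experimental options added above.
cmake -B build -DGGML_MUSA=ON \
    -DGGML_MUSA_GRAPHS=ON \
    -DGGML_MUSA_MUDNN_COPY=ON
cmake --build build --config Release
```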
CUDA backend common header:

```diff
@@ -765,7 +765,7 @@ struct ggml_tensor_extra_gpu {
 };
 
 
-#if (defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS))
+#if (defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)) || defined(GGML_MUSA_GRAPHS)
 #define USE_CUDA_GRAPH
 #endif
 
```
CUDA copy kernels (cpy.cu):

```diff
@@ -1,9 +1,9 @@
 #include "cpy.cuh"
 #include "dequantize.cuh"
 #include "cpy-utils.cuh"
-#ifdef GGML_USE_MUSA
+#if defined(GGML_USE_MUSA) && defined(GGML_MUSA_MUDNN_COPY)
 #include "ggml-musa/mudnn.cuh"
-#endif // GGML_USE_MUSA
+#endif // GGML_USE_MUSA && GGML_MUSA_MUDNN_COPY
 
 typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
 
@@ -121,7 +121,7 @@ static __global__ void cpy_q_f32(const char * cx, char * cdst_direct, const int
 // Copy destination pointers to GPU to be available when pointer indirection is in use
 
 void ggml_cuda_cpy_dest_ptrs_copy(ggml_cuda_graph * cuda_graph, char ** host_dest_ptrs, const int host_dest_ptrs_size, cudaStream_t stream) {
-#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
+#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS)
     if (cuda_graph->dest_ptrs_size < host_dest_ptrs_size) { // (re-)allocate GPU memory for destination pointers
         CUDA_CHECK(cudaStreamSynchronize(stream));
         if (cuda_graph->dest_ptrs_d != nullptr) {
@@ -314,7 +314,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
 
     char ** dest_ptrs_d = nullptr;
     int graph_cpynode_index = -1;
-#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
+#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS)
     if(ctx.cuda_graph->use_cpy_indirection && !disable_indirection_for_this_node) {
         dest_ptrs_d = ctx.cuda_graph->dest_ptrs_d;
         graph_cpynode_index = ctx.cuda_graph->graph_cpynode_index;
@@ -324,11 +324,11 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
 #endif
     if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
         GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));
-#ifdef GGML_USE_MUSA
+#if defined(GGML_USE_MUSA) && defined(GGML_MUSA_MUDNN_COPY)
         if (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) {
             CUDA_CHECK(mudnnMemcpyAsync(ctx, src1, src0));
         } else
-#endif // GGML_USE_MUSA
+#endif // GGML_USE_MUSA && GGML_MUSA_MUDNN_COPY
         {
             CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
         }
@@ -379,7 +379,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
         GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
                 ggml_type_name(src0->type), ggml_type_name(src1->type));
     }
-#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
+#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS)
     if(ctx.cuda_graph->use_cpy_indirection && !disable_indirection_for_this_node) {
         ctx.cuda_graph->graph_cpynode_index = graph_cpynode_index;
     }
```
							
								
								
									
ggml/src/ggml-cuda/vendors/musa.h:
```diff
@@ -13,7 +13,7 @@
 #define CUBLAS_OP_N MUBLAS_OP_N
 #define CUBLAS_OP_T MUBLAS_OP_T
 #define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS
-#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_MATH_MODE_DEFAULT
+#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_TENSOR_OP_MATH
 #define CUDA_R_16F  MUSA_R_16F
 #define CUDA_R_16BF MUSA_R_16BF
 #define CUDA_R_32F  MUSA_R_32F
@@ -29,7 +29,7 @@
 #define cublasSgemm mublasSgemm
 #define cublasStatus_t mublasStatus_t
 #define cublasOperation_t mublasOperation_t
-#define cublasGetStatusString mublasStatus_to_string
+#define cublasGetStatusString mublasGetStatusString
 #define cudaDataType_t musaDataType_t
 #define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer
 #define cudaDeviceDisablePeerAccess musaDeviceDisablePeerAccess
```
ggml-musa CMake configuration:

```diff
@@ -34,8 +34,12 @@ if (MUSAToolkit_FOUND)
     list(APPEND GGML_SOURCES_MUSA ${SRCS})
     file(GLOB   SRCS "../ggml-cuda/template-instances/mmq*.cu")
     list(APPEND GGML_SOURCES_MUSA ${SRCS})
-    file(GLOB   SRCS "../ggml-musa/*.cu")
-    list(APPEND GGML_SOURCES_MUSA ${SRCS})
+
+    if (GGML_MUSA_MUDNN_COPY)
+        file(GLOB   SRCS "../ggml-musa/*.cu")
+        list(APPEND GGML_SOURCES_MUSA ${SRCS})
+        add_compile_definitions(GGML_MUSA_MUDNN_COPY)
+    endif()
 
     if (GGML_CUDA_FA_ALL_QUANTS)
         file(GLOB   SRCS "../ggml-cuda/template-instances/fattn-vec*.cu")
@@ -72,6 +76,10 @@ if (MUSAToolkit_FOUND)
     add_compile_definitions(GGML_USE_MUSA)
     add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
 
+    if (GGML_MUSA_GRAPHS)
+        add_compile_definitions(GGML_MUSA_GRAPHS)
+    endif()
+
     if (GGML_CUDA_FORCE_MMQ)
         add_compile_definitions(GGML_CUDA_FORCE_MMQ)
     endif()
@@ -97,10 +105,16 @@ if (MUSAToolkit_FOUND)
     endif()
 
     if (GGML_STATIC)
-        # TODO: mudnn has not provided static libraries yet
         target_link_libraries(ggml-musa PRIVATE MUSA::musart_static MUSA::mublas_static)
+        # TODO: mudnn has not provided static libraries yet
+        # if (GGML_MUSA_MUDNN_COPY)
+        #     target_link_libraries(ggml-musa PRIVATE mudnn_static)
+        # endif()
     else()
-        target_link_libraries(ggml-musa PRIVATE MUSA::musart MUSA::mublas mudnn)
+        target_link_libraries(ggml-musa PRIVATE MUSA::musart MUSA::mublas)
+        if (GGML_MUSA_MUDNN_COPY)
+            target_link_libraries(ggml-musa PRIVATE mudnn)
+        endif()
     endif()
 
     if (GGML_CUDA_NO_VMM)
```
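With this change, muDNN is linked only when `GGML_MUSA_MUDNN_COPY` is enabled, and static builds cannot use it at all until mudnn ships static libraries. A hedged configure sketch for a static build under those constraints, using the `GGML_STATIC` switch seen in the hunk above:

```sh
# Assumption: a static MUSA build links only MUSA::musart_static/MUSA::mublas_static,
# so the muDNN copy path stays disabled here.
cmake -B build -DGGML_MUSA=ON -DGGML_STATIC=ON -DGGML_MUSA_MUDNN_COPY=OFF
```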