Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-30 08:42:00 +00:00)
	CUDA: Fixed OpenLLaMA 3b mmq, reduced compile time (#2590)
CMakeLists.txt

@@ -69,7 +69,6 @@ option(LLAMA_BLAS                            "llama: use BLAS"
 set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
 option(LLAMA_CUBLAS                          "llama: use CUDA"                                  OFF)
 #option(LLAMA_CUDA_CUBLAS                     "llama: use cuBLAS for prompt processing"          OFF)
-set(LLAMA_CUDA_MMQ_Y       "64" CACHE STRING "llama: y tile size for mmq CUDA kernels")
 option(LLAMA_CUDA_FORCE_DMMV                 "llama: use dmmv instead of mmvq CUDA kernels"     OFF)
 set(LLAMA_CUDA_DMMV_X      "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
 set(LLAMA_CUDA_MMV_Y        "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
@@ -256,7 +255,6 @@ if (LLAMA_CUBLAS)
 #        if (LLAMA_CUDA_CUBLAS)
 #            add_compile_definitions(GGML_CUDA_CUBLAS)
 #        endif()
-        add_compile_definitions(GGML_CUDA_MMQ_Y=${LLAMA_CUDA_MMQ_Y})
         if (LLAMA_CUDA_FORCE_DMMV)
             add_compile_definitions(GGML_CUDA_FORCE_DMMV)
         endif()
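With LLAMA_CUDA_MMQ_Y removed, the mmq y tile size is no longer a configure-time knob; it is fixed inside ggml-cuda.cu. As a minimal sketch (an assumed invocation, not taken from the commit), a CUDA build after this change would use only the options that survive the diff above:

    # Assumed example: configure a CUDA build after this commit.
    # Passing -DLLAMA_CUDA_MMQ_Y=... would now have no effect, since both the
    # cache variable and the GGML_CUDA_MMQ_Y compile definition were removed.
    cmake .. -DLLAMA_CUBLAS=ON -DLLAMA_CUDA_DMMV_X=32 -DLLAMA_CUDA_MMV_Y=1

The values 32 and 1 are simply the defaults shown in the diff; omitting those flags gives the same build.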
							
								
								
									
ggml-cuda.cu (836 changed lines): file diff suppressed because it is too large.
Author: Johannes Gäßler