	CUDA: mmq CLI option, fixed mmq build issues (#2453)
@@ -631,6 +631,9 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     fprintf(stdout, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
     fprintf(stdout, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
     fprintf(stdout, "  -lv, --low-vram don't allocate VRAM scratch buffer\n");
+    fprintf(stdout, "  -mmq, --mul-mat-q     use experimental mul_mat_q CUDA kernels instead of cuBLAS. TEMP!!!\n" );
+    fprintf(stdout, "                        Reduces VRAM usage by 700/970/1430 MiB for 7b/13b/33b but prompt processing speed\n" );
+    fprintf(stdout, "                        is still suboptimal, especially q2_K, q3_K, q5_K, and q6_K.\n" );
 #endif
     fprintf(stdout, "  -m FNAME, --model FNAME\n");
     fprintf(stdout, "                        model path (default: %s)\n", params.model.c_str());
@@ -827,7 +830,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
                 }
             }
 #else
-            LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.", {});
+            LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n", {});
 #endif // GGML_USE_CUBLAS
         }
         else if (arg == "--low-vram" || arg == "-lv")
@@ -835,7 +838,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
 #ifdef GGML_USE_CUBLAS
             params.low_vram = true;
 #else
-            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
+            LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n", {});
 #endif // GGML_USE_CUBLAS
         }
+        else if (arg == "--mul-mat-q" || arg == "-mmq")
+        {
+#ifdef GGML_USE_CUBLAS
+            params.mul_mat_q = true;
+#else
+            LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. It is not possible to use mul_mat_q kernels.\n", {});
+#endif // GGML_USE_CUBLAS
+        }
         else if (arg == "--main-gpu" || arg == "-mg")
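
For reference, a minimal sketch of how the new option is invoked on the command line; the model path is a placeholder, and per the #ifdef above the flag only has an effect when the server is built with cuBLAS:

    # model path is a placeholder; -mmq / --mul-mat-q enables the experimental mul_mat_q kernels
    ./server -m ./models/7B/ggml-model-q4_0.bin --mul-mat-q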
Johannes Gäßler