	Nomic Vulkan backend (#4456)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: niansa <anton-sa@web.de>
Co-authored-by: Adam Treat <treat.adam@gmail.com>
Co-authored-by: Aaron Miller <apage43@ninjawhale.com>
Co-authored-by: ToKiNoBug <tokinobug@163.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>
llama.cpp | 35 lines changed
@@ -15,6 +15,8 @@
 #  include "ggml-vulkan.h"
 #elif defined(GGML_USE_SYCL)
 #  include "ggml-sycl.h"
+#elif defined(GGML_USE_KOMPUTE)
+#   include "ggml-kompute.h"
 #endif
 
 #ifdef GGML_USE_METAL
@@ -1313,6 +1315,11 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
     buft = ggml_backend_sycl_buffer_type(gpu);
 #elif defined(GGML_USE_CLBLAST)
     buft = ggml_backend_opencl_buffer_type();
+#elif defined(GGML_USE_KOMPUTE)
+    buft = ggml_backend_kompute_buffer_type(gpu);
+    if (buft == nullptr) {
+        LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
+    }
 #endif
 
     if (buft == nullptr) {
@@ -4107,7 +4114,7 @@ static bool llm_load_tensors(
 }
 
 // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
-static int llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
+static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
     try {
         llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
 
@@ -4128,6 +4135,22 @@ static int llama_model_load(const std::string & fname, llama_model & model, cons
             return 0;
         }
 
+#ifdef GGML_USE_KOMPUTE
+        if (ggml_vk_has_device() && params.n_gpu_layers > 0 && (
+            !(model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON)
+            || !(
+                model.ftype == LLAMA_FTYPE_ALL_F32 ||
+                model.ftype == LLAMA_FTYPE_MOSTLY_F16 ||
+                model.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
+                model.ftype == LLAMA_FTYPE_MOSTLY_Q4_1
+            )
+        )) {
+            // disable Vulkan due to unsupported model architecture or quantization type
+            // TODO(cebtenzzre): propagate this error outside of llama_load_model_from_file
+            params.n_gpu_layers = 0;
+        }
+#endif
+
         if (!llm_load_tensors(
             ml, model, params.n_gpu_layers, params.split_mode,  params.main_gpu, params.tensor_split, params.use_mlock,
             params.progress_callback, params.progress_callback_user_data
@@ -10259,6 +10282,16 @@ struct llama_context * llama_new_context_with_model(
             }
             ctx->backends.push_back(backend);
         }
+#elif defined(GGML_USE_KOMPUTE)
+        if (model->n_gpu_layers > 0) {
+            auto * backend = ggml_backend_kompute_init(model->main_gpu);
+            if (backend == nullptr) {
+                LLAMA_LOG_ERROR("%s: failed to initialize Kompute backend\n", __func__);
+                llama_free(ctx);
+                return nullptr;
+            }
+            ctx->backends.push_back(backend);
+        }
 #endif
         ctx->backend_cpu = ggml_backend_cpu_init();
         if (ctx->backend_cpu == nullptr) {
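For reference, a minimal sketch of how a caller would exercise the new code paths above, assuming a build with GGML_USE_KOMPUTE defined and the public llama.h API as it stood at this commit (llama_backend_init still takes a numa flag here). This is an illustration, not part of the commit:

// Minimal sketch (assumes a GGML_USE_KOMPUTE build; not part of this commit).
#include "llama.h"

#include <cstdio>

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s MODEL.gguf\n", argv[0]);
        return 1;
    }

    llama_backend_init(false); // numa = false

    // Request full offload. Per the TODO above, an unsupported model arch or
    // quantization type makes llama_model_load() silently reset n_gpu_layers
    // to 0, so the model falls back to CPU instead of failing the load.
    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 99;

    llama_model * model = llama_load_model_from_file(argv[1], mparams);
    if (model == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    // Context creation calls ggml_backend_kompute_init(main_gpu) and, unlike
    // the load path, fails hard (returns NULL) if the device cannot be set up.
    llama_context * ctx = llama_new_context_with_model(model, llama_context_default_params());
    if (ctx == NULL) {
        fprintf(stderr, "failed to create context\n");
        llama_free_model(model);
        return 1;
    }

    // ... evaluate tokens here ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}

Note the two distinct failure modes in the diff: an unsupported model or quantization type degrades to CPU at load time (with only a warning from llama_default_buffer_type_offload), while an unusable Vulkan device is a hard error at context creation.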