mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-31 08:51:55 +00:00 
			
		
		
		
	| @@ -271,19 +271,32 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     // add extra buffer types | ||||
|     auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); | ||||
|     auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev); | ||||
|     auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t) | ||||
|         ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts"); | ||||
|     if (ggml_backend_dev_get_extra_bufts_fn) { | ||||
|         ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev); | ||||
|         while (extra_bufts && *extra_bufts) { | ||||
|             buft_list.emplace_back(cpu_dev, *extra_bufts); | ||||
|             ++extra_bufts; | ||||
|     bool has_gpu_device = false; | ||||
|     for (auto * dev : devices) { | ||||
|         if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) { | ||||
|             has_gpu_device = true; | ||||
|             break; | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     // add extra buffer types, only if no GPU device is present | ||||
|     // ref: https://github.com/ggml-org/llama.cpp/issues/12481#issuecomment-2743136094 | ||||
|     if (!has_gpu_device) { | ||||
|         auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); | ||||
|         auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev); | ||||
|         auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t) | ||||
|             ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts"); | ||||
|         if (ggml_backend_dev_get_extra_bufts_fn) { | ||||
|             ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev); | ||||
|             while (extra_bufts && *extra_bufts) { | ||||
|                 buft_list.emplace_back(cpu_dev, *extra_bufts); | ||||
|                 ++extra_bufts; | ||||
|             } | ||||
|         } | ||||
|     } else { | ||||
|         LLAMA_LOG_WARN("%s: disabling extra buffer types (i.e. repacking) since a GPU device is available\n", __func__); | ||||
|     } | ||||
|  | ||||
|     // add a host buffer type | ||||
|     // storing the tensors in a host buffer is useful when the processing of large batches | ||||
|     // is offloaded to a GPU device, since it reduces the time spent on data transfers | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Georgi Gerganov
					Georgi Gerganov