Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-31 08:51:55 +00:00)
	Support multiple GPUs (split mode) on SYCL backend (#5806)
* support multiple cards: split-mode - layer|row
* rm warning
* rebase with master, support two new OPs, close feature for -sm=row, fix for unit test
* update news
* fix merge error
* update according to review comments
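For context, split mode is chosen on the llama.cpp command line of this era: -sm layer spreads whole layers across all visible SYCL GPUs, -sm none together with -mg <device> pins everything to a single card, and -sm row is the mode the message says is closed (disabled) for SYCL. Flag names are taken from the CLI of this period and may differ in later builds.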
Changed: llama.cpp (49 lines)
@@ -104,6 +104,7 @@
 #define LLAMA_MAX_NODES   8192
 #define LLAMA_MAX_EXPERTS 8
 
+
 //
 // logging
 //
@@ -1429,7 +1430,9 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
         buft = ggml_backend_cuda_host_buffer_type();
     }
 #elif defined(GGML_USE_SYCL)
-    buft = ggml_backend_sycl_host_buffer_type();
+    if (host_buffer) {
+        buft = ggml_backend_sycl_host_buffer_type();
+    }
 #elif defined(GGML_USE_CPU_HBM)
     buft = ggml_backend_cpu_hbm_buffer_type();
 #elif defined(GGML_USE_VULKAN)
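The guard added here makes the pinned SYCL host buffer opt-in: before the patch, llama_default_buffer_type_cpu returned it even when the caller passed host_buffer == false, producing pinned memory where no GPU transfer was involved. A minimal standalone sketch of the selection logic, with stub types in place of the real ggml API:

    #include <cstdio>

    // stand-ins for ggml buffer types, not the real API
    struct buffer_type { const char * name; };

    static buffer_type cpu_buft  = { "CPU" };
    static buffer_type host_buft = { "SYCL_Host" };  // pinned host memory

    // mirrors the post-patch shape of llama_default_buffer_type_cpu(bool host_buffer)
    static buffer_type * default_buffer_type_cpu(bool host_buffer) {
        buffer_type * buft = nullptr;
        if (host_buffer) {      // before the patch this assignment was unconditional
            buft = &host_buft;
        }
        if (buft == nullptr) {  // fall back to plain CPU memory
            buft = &cpu_buft;
        }
        return buft;
    }

    int main() {
        printf("host_buffer=true  -> %s\n", default_buffer_type_cpu(true)->name);
        printf("host_buffer=false -> %s\n", default_buffer_type_cpu(false)->name);
        return 0;
    }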
@@ -1483,6 +1486,12 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
     }
 #endif
 
+#ifdef GGML_USE_SYCL
+    if (ggml_backend_sycl_get_device_count() > 1) {
+        buft = ggml_backend_sycl_split_buffer_type(tensor_split);
+    }
+#endif
+
     if (buft == nullptr) {
         buft = llama_default_buffer_type_offload(fallback_gpu);
     }
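The split buffer type is only selected when more than one SYCL device is present; with a single device, buft stays nullptr and the function falls back to a plain offload buffer on fallback_gpu. Conceptually, a split buffer type shards each weight matrix's rows across devices according to tensor_split. A self-contained sketch of that arithmetic (illustrative only, not the ggml implementation):

    #include <cstdio>

    int main() {
        const int   nrows          = 100;                // rows of one weight matrix
        const float tensor_split[] = { 0.75f, 0.25f };   // per-device fractions (sum to 1)
        const int   n_dev          = 2;

        // carve [0, nrows) into contiguous per-device row ranges
        int   begin = 0;
        float acc   = 0.0f;
        for (int i = 0; i < n_dev; ++i) {
            acc += tensor_split[i];
            const int end = (i == n_dev - 1) ? nrows : (int)(acc * nrows);
            printf("device %d: rows [%d, %d)\n", i, begin, end);
            begin = end;
        }
        return 0;
    }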
@@ -1494,6 +1503,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g
 static size_t llama_get_device_count() {
 #if defined(GGML_USE_CUBLAS)
     return ggml_backend_cuda_get_device_count();
+#elif defined(GGML_USE_SYCL)
+    return ggml_backend_sycl_get_device_count();
 #elif defined(GGML_USE_VULKAN)
     return ggml_backend_vk_get_device_count();
 #else
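llama_get_device_count() is one of several small dispatchers in which the backend is fixed at compile time: exactly one GGML_USE_* macro is defined per build, so an #if/#elif chain selects the implementation, with a CPU fallback in the #else branch. A standalone illustration of the pattern (macro name and counts invented for the example):

    #include <cstdio>

    #define USE_SYCL  // pretend the build defined exactly one backend macro

    static int device_count() {
    #if defined(USE_CUBLAS)
        return 8;     // stand-in for ggml_backend_cuda_get_device_count()
    #elif defined(USE_SYCL)
        return 2;     // stand-in for ggml_backend_sycl_get_device_count()
    #else
        return 1;     // CPU-only build behaves as a single device
    #endif
    }

    int main() {
        printf("devices: %d\n", device_count());
        return 0;
    }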
@@ -1507,6 +1518,11 @@ static size_t llama_get_device_memory(int device) {
     size_t free;
     ggml_backend_cuda_get_device_memory(device, &total, &free);
     return free;
+#elif defined(GGML_USE_SYCL)
+    size_t total;
+    size_t free;
+    ggml_backend_sycl_get_device_memory(device, &total, &free);
+    return free;
 #elif defined(GGML_USE_VULKAN)
     size_t total;
     size_t free;
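Reporting free memory per device is what lets the loader pick sensible default splits when the user passes no -ts/--tensor-split. A toy computation of memory-proportional shares (hypothetical numbers; the real heuristic lives in the llama.cpp loader):

    #include <cstdio>
    #include <vector>

    int main() {
        // pretend free VRAM reported per device,
        // e.g. by ggml_backend_sycl_get_device_memory
        std::vector<size_t> free_mem = { 12ull << 30, 4ull << 30 };  // 12 GiB, 4 GiB

        size_t total = 0;
        for (size_t f : free_mem) total += f;

        // share of the offloaded layers each device should receive
        for (size_t i = 0; i < free_mem.size(); ++i) {
            printf("device %zu: %.0f%% of offloaded layers\n",
                   i, 100.0 * (double) free_mem[i] / (double) total);
        }
        return 0;
    }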
@@ -12075,13 +12091,31 @@ struct llama_context * llama_new_context_with_model(
         }
 #elif defined(GGML_USE_SYCL)
         if (model->n_gpu_layers > 0) {
-            ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
-            if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
-                llama_free(ctx);
-                return nullptr;
+            // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
+            if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
+                int main_gpu_index = ggml_backend_sycl_get_device_index(model->main_gpu);
+                ggml_backend_t backend = ggml_backend_sycl_init(main_gpu_index);
+                if (backend == nullptr) {
+                    LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, model->main_gpu, main_gpu_index);
+                    llama_free(ctx);
+                    return nullptr;
+                }
+                ctx->backends.push_back(backend);
+            } else {
+                // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
+                int id_list[GGML_SYCL_MAX_DEVICES];
+                ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
+                for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
+                    int device_id = id_list[i];
+                    ggml_backend_t backend = ggml_backend_sycl_init(i);
+                    if (backend == nullptr) {
+                        LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, device_id, i);
+                        llama_free(ctx);
+                        return nullptr;
+                    }
+                    ctx->backends.push_back(backend);
+                }
             }
-            ctx->backends.push_back(backend);
         }
 #elif defined(GGML_USE_KOMPUTE)
         if (model->n_gpu_layers > 0) {
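This hunk is the heart of the change: with split mode NONE or ROW only the main GPU gets a backend, after the user-facing device id is translated to a dense backend index via ggml_backend_sycl_get_device_index; with LAYER split, one backend is created per entry of the GPU list from ggml_sycl_get_gpu_list. The id/index distinction exists because the SYCL backend keeps a filtered list of usable GPUs, so a raw device id is not necessarily its position in that list. A standalone sketch with invented device ids:

    #include <cstdio>
    #include <vector>

    // hypothetical filtered GPU list: device 1 was skipped (e.g. not a GPU)
    static const std::vector<int> gpu_ids = { 0, 2, 3 };

    static int get_device_count() { return (int) gpu_ids.size(); }

    // map a user-facing device id to its dense index in the filtered list
    static int get_device_index(int device_id) {
        for (int i = 0; i < get_device_count(); ++i) {
            if (gpu_ids[i] == device_id) return i;
        }
        return -1;
    }

    // stand-in for ggml_backend_sycl_init: succeeds for any valid index
    static bool init_backend(int index) {
        return index >= 0 && index < get_device_count();
    }

    int main() {
        // NONE/ROW path: only the main GPU (user-facing id 2) gets a backend
        const int main_gpu = 2;
        const int main_idx = get_device_index(main_gpu);
        printf("main gpu SYCL%d (index %d): %s\n", main_gpu, main_idx,
               init_backend(main_idx) ? "ok" : "failed");

        // LAYER path: one backend per GPU in the list
        for (int i = 0; i < get_device_count(); ++i) {
            printf("SYCL%d (index %d): %s\n", gpu_ids[i], i,
                   init_backend(i) ? "ok" : "failed");
        }
        return 0;
    }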
@@ -12161,7 +12195,6 @@ struct llama_context * llama_new_context_with_model(
             ggml_set_name(ctx->inp_cls,     "inp_cls");
 
             ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
-
             LLAMA_LOG_INFO("%s: %10s input buffer size   = %8.2f MiB\n", __func__,
                     ggml_backend_buffer_name(ctx->buf_input),
                     ggml_backend_buffer_get_size(ctx->buf_input) / 1024.0 / 1024.0);
Author: Neo Zhang Jianyu