	cuda : improve cuda pool efficiency using virtual memory (#4606)
* cuda : improve cuda pool efficiency using virtual memory
* fix mixtral
* fix cmake build
* check for vmm support, disable for hip
  ggml-ci
* fix hip build
* clarify granularity
* move all caps to g_device_caps
* refactor error checking
* add cuda_pool_alloc, refactor most pool allocations
  ggml-ci
* fix hip build
* CUBLAS_TF32_TENSOR_OP_MATH is not a macro
* more hip crap
* llama : fix msvc warnings
* ggml : fix msvc warnings
* minor
* minor
* cuda : fallback to CPU on host buffer alloc fail
* Update ggml-cuda.cu
  Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
* Update ggml-cuda.cu
  Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
* ensure allocations are always aligned
* act_size -> actual_size

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
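The headline change replaces the cudaMalloc-based scratch pool with one built on the CUDA virtual memory management (VMM) API: a large virtual address range is reserved once up front, and physical memory is created and mapped into it on demand, so the pool can grow without freeing and reallocating (and therefore without copying) live buffers. Below is a minimal sketch of the idea using the CUDA driver API; the identifiers (`pool_ensure_size`, `POOL_VMM_MAX_SIZE`) are illustrative, not the ones used in ggml-cuda.cu, error handling is reduced to a single macro, and a current CUDA context is assumed.

```c
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define CU_CHECK(x) do { CUresult r_ = (x); if (r_ != CUDA_SUCCESS) { \
    fprintf(stderr, "cuda error %d at %s:%d\n", (int)r_, __FILE__, __LINE__); exit(1); } } while (0)

/* arbitrary cap for the sketch: 32 GiB of *address space* -- reserving it
 * costs no physical memory */
#define POOL_VMM_MAX_SIZE (32ull << 30)

static CUdeviceptr pool_addr = 0; /* start of the reserved virtual range        */
static size_t      pool_size = 0; /* bytes currently backed by physical memory  */

/* grow the pool so that at least `size` bytes are usable; existing pointers
 * stay valid because the virtual addresses never move, which is the
 * efficiency win over cudaMalloc-based pools that must free and reallocate
 * (and copy) when they outgrow a buffer */
static void pool_ensure_size(int device, size_t size) {
    if (size <= pool_size) {
        return;
    }

    CUmemAllocationProp prop;
    memset(&prop, 0, sizeof(prop));
    prop.type          = CU_MEM_ALLOCATION_TYPE_PINNED;
    prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    prop.location.id   = device;

    /* physical chunks can only be created in multiples of the device's
     * allocation granularity; rounding up also keeps pool offsets aligned */
    size_t granularity = 0;
    CU_CHECK(cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
    size_t grow = (size - pool_size + granularity - 1) / granularity * granularity;

    if (pool_addr == 0) {
        /* reserve the whole virtual range once, up front */
        CU_CHECK(cuMemAddressReserve(&pool_addr, POOL_VMM_MAX_SIZE, 0, 0, 0));
    }

    /* create physical memory and map it at the end of the reserved range */
    CUmemGenericAllocationHandle handle;
    CU_CHECK(cuMemCreate(&handle, grow, &prop, 0));
    CU_CHECK(cuMemMap(pool_addr + pool_size, grow, 0, handle, 0));

    CUmemAccessDesc access;
    memset(&access, 0, sizeof(access));
    access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    access.location.id   = device;
    access.flags         = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
    CU_CHECK(cuMemSetAccess(pool_addr + pool_size, grow, &access, 1));

    pool_size += grow;
}
```

The granularity rounding is what the "clarify granularity" and "ensure allocations are always aligned" items refer to, and devices without VMM support (and HIP at the time) keep the legacy pool, per the "check for vmm support, disable for hip" item. The excerpt below shows the ggml-backend.c portion of the change.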
```diff
@@ -297,7 +297,7 @@ static void ggml_backend_registry_init(void) {
 void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
     GGML_ASSERT(ggml_backend_registry_count < GGML_MAX_BACKENDS_REG);
 
-    int id = ggml_backend_registry_count;
+    size_t id = ggml_backend_registry_count;
 
     ggml_backend_registry[id] = (struct ggml_backend_reg) {
         /* .name                = */ {0},
@@ -330,6 +330,8 @@ size_t ggml_backend_reg_find_by_name(const char * name) {
             return i;
         }
     }
+
+    // not found
     return SIZE_MAX;
 }
 
```
```diff
@@ -340,15 +342,15 @@ ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str)
     const char * params = strchr(backend_str, ':');
     char backend_name[128];
     if (params == NULL) {
-        strcpy(backend_name, backend_str);
+        snprintf(backend_name, sizeof(backend_name), "%s", backend_str);
         params = "";
     } else {
-        strncpy(backend_name, backend_str, params - backend_str);
-        backend_name[params - backend_str] = '\0';
+        snprintf(backend_name, sizeof(backend_name), "%.*s", (int)(params - backend_str), backend_str);
         params++;
     }
 
     size_t backend_i = ggml_backend_reg_find_by_name(backend_name);
+
     if (backend_i == SIZE_MAX) {
         fprintf(stderr, "%s: backend %s not found\n", __func__, backend_name);
         return NULL;
```
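The hunk above swaps the unbounded strcpy and the strncpy pair for snprintf. strncpy does not NUL-terminate the destination when the source fills it (hence the old explicit terminator), and the strcpy branch had no bound at all against the 128-byte buffer; snprintf always terminates, never writes past the given size, and with "%.*s" takes an explicit prefix length. A standalone illustration of the idiom, with a hypothetical input string:

```c
#include <stdio.h>
#include <string.h>

int main(void) {
    const char * backend_str = "CUDA0:device=0"; /* hypothetical input */
    char backend_name[128];

    const char * params = strchr(backend_str, ':');
    if (params == NULL) {
        /* no parameter list: bounded copy of the whole string */
        snprintf(backend_name, sizeof(backend_name), "%s", backend_str);
    } else {
        /* copy only the prefix before the ':' -- the precision of "%.*s"
         * must be an int, hence the cast; snprintf NUL-terminates and
         * never writes more than sizeof(backend_name) bytes */
        snprintf(backend_name, sizeof(backend_name), "%.*s",
                 (int)(params - backend_str), backend_str);
    }

    printf("%s\n", backend_name); /* prints "CUDA0" */
    return 0;
}
```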
```diff
@@ -396,18 +398,12 @@ static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
 }
 
 static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-
     memcpy((char *)tensor->data + offset, data, size);
 
     GGML_UNUSED(buffer);
 }
 
 static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-
     memcpy(data, (const char *)tensor->data + offset, size);
 
     GGML_UNUSED(buffer);
```
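Finally, the "cuda : fallback to CPU on host buffer alloc fail" item: pinned (page-locked) host memory is a limited resource, and the commit makes a failed pinned allocation non-fatal instead of aborting. A sketch of that pattern with the CUDA runtime API; the function name is hypothetical, not the actual ggml symbol:

```c
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

/* try pinned host memory first; degrade to pageable memory if that fails
 * (transfers get slower, but the allocation still succeeds) */
static void * host_malloc_with_fallback(size_t size) {
    void * ptr = NULL;
    cudaError_t err = cudaMallocHost(&ptr, size);
    if (err != cudaSuccess) {
        cudaGetLastError(); /* clear the error state */
        fprintf(stderr, "warning: failed to allocate %zu bytes of pinned memory: %s\n",
                size, cudaGetErrorString(err));
        return malloc(size);
    }
    return ptr;
}
```

Note that the caller has to remember which allocator succeeded, since pinned memory must be released with cudaFreeHost while the fallback path uses free.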