	vulkan: Find optimal memory type but with fallback (#5381)
* @0cc4m feedback
* More feedback @0cc4m
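The change teaches the Vulkan buffer allocator to look for a memory type that satisfies the preferred property flags first, and only if none qualifies retry the search with a second, less demanding set of flags, instead of relying on exception-driven fallbacks at the call sites. A minimal sketch of that selection-with-fallback pattern follows; it uses plain bitmasks and illustrative names rather than the vk:: types and ggml helpers in the actual diff below.

#include <cstdint>
#include <cstdio>
#include <vector>

// First-fit search: return the index of the first memory type whose
// property bits contain every bit in `wanted`, or UINT32_MAX if none does.
static uint32_t pick_memory_type(const std::vector<uint32_t>& type_flags, uint32_t wanted) {
    for (uint32_t i = 0; i < type_flags.size(); ++i) {
        if ((type_flags[i] & wanted) == wanted) {
            return i;
        }
    }
    return UINT32_MAX;
}

int main() {
    // Pretend device with two memory types (flag values are illustrative only).
    const uint32_t DEVICE_LOCAL = 1, HOST_VISIBLE = 2, HOST_COHERENT = 4, HOST_CACHED = 8;
    const std::vector<uint32_t> types = { DEVICE_LOCAL, HOST_VISIBLE | HOST_COHERENT };

    // Preferred: cached, coherent host memory. Fallback: any coherent host memory.
    uint32_t idx = pick_memory_type(types, HOST_VISIBLE | HOST_COHERENT | HOST_CACHED);
    if (idx == UINT32_MAX) {
        idx = pick_memory_type(types, HOST_VISIBLE | HOST_COHERENT);  // fallback flags
    }
    std::printf("chosen memory type index: %u\n", idx);  // prints 1 on this pretend device
    return 0;
}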
@@ -707,9 +707,21 @@ static void ggml_vk_queue_cleanup(ggml_backend_vk_context * ctx, vk_queue& q) {
     q.cmd_buffer_idx = 0;
 }
 
-static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags) {
+static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_props, vk::MemoryRequirements* mem_req, vk::MemoryPropertyFlags flags) {
+    for (uint32_t i = 0; i < mem_props->memoryTypeCount; ++i) {
+        vk::MemoryType memory_type = mem_props->memoryTypes[i];
+        if ((mem_req->memoryTypeBits & ((uint64_t)1 << i)) &&
+            (flags & memory_type.propertyFlags) == flags &&
+            mem_props->memoryHeaps[memory_type.heapIndex].size >= mem_req->size) {
+            return static_cast<int32_t>(i);
+        }
+    }
+    return UINT32_MAX;
+}
+
+static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_create_buffer(" << size << ", " << to_string(req_flags) << ")" << std::endl;
+    std::cerr << "ggml_vk_create_buffer(" << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")" << std::endl;
 #endif
     vk_buffer buf = std::make_shared<vk_buffer_struct>();
 
@@ -736,15 +748,15 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz
 
     uint32_t memory_type_index = UINT32_MAX;
 
-    for (uint32_t i = 0; i < mem_props.memoryTypeCount; ++i) {
-        vk::MemoryType memory_type = mem_props.memoryTypes[i];
-        if ((mem_req.memoryTypeBits & ((uint64_t)1 << i)) && (req_flags & memory_type.propertyFlags) == req_flags && mem_props.memoryHeaps[memory_type.heapIndex].size >= mem_req.size) {
-            memory_type_index = i;
-            break;
-        }
+    memory_type_index = find_properties(&mem_props, &mem_req, req_flags);
+    buf->memory_property_flags = req_flags;
+
+    if (memory_type_index == UINT32_MAX && fallback_flags) {
+        memory_type_index = find_properties(&mem_props, &mem_req, fallback_flags);
+        buf->memory_property_flags = fallback_flags;
     }
 
-    if (memory_type_index >= mem_props.memoryTypeCount) {
+    if (memory_type_index == UINT32_MAX) {
         ctx->device.lock()->device.destroyBuffer(buf->buffer);
         buf->size = 0;
         throw vk::OutOfDeviceMemoryError("No suitable memory type found");
@@ -758,10 +770,9 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz
         buf->size = 0;
         throw e;
     }
-    buf->memory_property_flags = req_flags;
     buf->ptr = nullptr;
 
-    if (req_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
+    if (buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
         buf->ptr = ctx->device.lock()->device.mapMemory(buf->device_memory, 0, VK_WHOLE_SIZE);
     }
 
@@ -778,9 +789,9 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz
     return buf;
 }
 
-static vk_buffer ggml_vk_create_buffer_check(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags) {
+static vk_buffer ggml_vk_create_buffer_check(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) {
     try {
-        return ggml_vk_create_buffer(ctx, size, req_flags);
+        return ggml_vk_create_buffer(ctx, size, req_flags, fallback_flags);
     } catch (const vk::SystemError& e) {
         std::cerr << "ggml_vulkan: Memory allocation of size " << size << " failed." << std::endl;
         std::cerr << "ggml_vulkan: " << e.what() << std::endl;
@@ -791,17 +802,17 @@ static vk_buffer ggml_vk_create_buffer_check(ggml_backend_vk_context * ctx, size
 static vk_buffer ggml_vk_create_buffer_device(ggml_backend_vk_context * ctx, size_t size) {
     vk_buffer buf;
     try {
-        buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal);
-    } catch (const vk::SystemError& e) {
         if (ctx->device.lock()->uma) {
             // Fall back to host memory type
-            buf = ggml_vk_create_buffer_check(ctx, size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
+            buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
         } else {
+            buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal);
+        }
+    } catch (const vk::SystemError& e) {
         std::cerr << "ggml_vulkan: Device memory allocation of size " << size << " failed." << std::endl;
         std::cerr << "ggml_vulkan: " << e.what() << std::endl;
         throw e;
     }
-    }
 
     return buf;
 }
@@ -1422,7 +1433,9 @@ static void * ggml_vk_host_malloc(ggml_backend_vk_context * ctx, size_t size) {
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_vk_host_malloc(" << size << ")" << std::endl;
 #endif
-    vk_buffer buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
+    vk_buffer buf = ggml_vk_create_buffer(ctx, size,
+        vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
+        vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
 
     if(!(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible)) {
         fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory\n",
@@ -1568,7 +1581,9 @@ static void deferred_memcpy(void * dst, const void * src, size_t size, std::vect
 static void ggml_vk_ensure_sync_staging_buffer(ggml_backend_vk_context * ctx, size_t size) {
     if (ctx->sync_staging == nullptr || ctx->sync_staging->size < size) {
         ggml_vk_destroy_buffer(ctx->sync_staging);
-        ctx->sync_staging = ggml_vk_create_buffer_check(ctx, size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
+        ctx->sync_staging = ggml_vk_create_buffer_check(ctx, size,
+            vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
+            vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
     }
 }
 
@@ -4082,7 +4097,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
     std::cerr << "ggml_vk_preallocate_buffers(qx_size: " << ctx->prealloc_size_qx << " qy_size: " << ctx->prealloc_size_qy << " x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << ")" << std::endl;
 #endif
 #if defined(GGML_VULKAN_RUN_TESTS)
-    ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
+    ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul,
+        vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
+        vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
     ggml_vk_test_transfer(ctx, 8192 * 1000, false);
     ggml_vk_test_transfer(ctx, 8192 * 1000, true);
 
@@ -4174,7 +4191,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
         if (ctx->staging != nullptr) {
            ggml_vk_destroy_buffer(ctx->staging);
         }
-        ctx->staging = ggml_vk_create_buffer_check(ctx, ctx->staging_size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached);
+        ctx->staging = ggml_vk_create_buffer_check(ctx, ctx->staging_size,
+            vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostCached,
+            vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
     }
 }
 
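Taken together, the call sites now state their preference and fallback explicitly: ggml_vk_host_malloc and the staging buffers request eHostVisible | eHostCoherent | eHostCached and fall back to eHostVisible | eHostCoherent, while ggml_vk_create_buffer_device on UMA devices requests eDeviceLocal with a host-visible, host-coherent fallback instead of retrying inside a catch block.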
Neuman Vong