mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-30 08:42:00 +00:00 
			
		
		
		
	ggml : fix graph reallocation with multiple chunks (#16396)
Reallocation is needed if any single chunk grows in size, even if the total allocation size stays the same or decreases.
This commit is contained in:
		| @@ -392,12 +392,8 @@ static void ggml_dyn_tallocr_free(struct ggml_dyn_tallocr * alloc) { | |||||||
|     free(alloc); |     free(alloc); | ||||||
| } | } | ||||||
|  |  | ||||||
| static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc) { | static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc, int chunk) { | ||||||
|     size_t max_size = 0; |     return chunk < alloc->n_chunks ? alloc->chunks[chunk]->max_size : 0; | ||||||
|     for (int i = 0; i < alloc->n_chunks; i++) { |  | ||||||
|         max_size += alloc->chunks[i]->max_size; |  | ||||||
|     } |  | ||||||
|     return max_size; |  | ||||||
| } | } | ||||||
|  |  | ||||||
|  |  | ||||||
| @@ -417,10 +413,8 @@ static void ggml_vbuffer_free(struct vbuffer * buf) { | |||||||
|     free(buf); |     free(buf); | ||||||
| } | } | ||||||
|  |  | ||||||
| static int ggml_vbuffer_n_chunks(struct vbuffer * buf) { | static size_t ggml_vbuffer_chunk_size(struct vbuffer * buf, int chunk) { | ||||||
|     int n = 0; |     return buf->chunks[chunk] ? ggml_backend_buffer_get_size(buf->chunks[chunk]) : 0; | ||||||
|     while (n < GGML_VBUFFER_MAX_CHUNKS && buf->chunks[n]) n++; |  | ||||||
|     return n; |  | ||||||
| } | } | ||||||
|  |  | ||||||
| static size_t ggml_vbuffer_size(struct vbuffer * buf) { | static size_t ggml_vbuffer_size(struct vbuffer * buf) { | ||||||
| @@ -885,12 +879,20 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c | |||||||
|             } |             } | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         size_t cur_size = galloc->buffers[i] ? ggml_vbuffer_size(galloc->buffers[i]) : 0; |  | ||||||
|         size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]); |  | ||||||
|  |  | ||||||
|         // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views |         // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views | ||||||
|         if (new_size > cur_size || galloc->buffers[i] == NULL) { |         bool realloc = galloc->buffers[i] == NULL; | ||||||
|  |         size_t new_size = 0; | ||||||
|  |         for (int c = 0; c < galloc->buf_tallocs[i]->n_chunks; c++) { | ||||||
|  |             size_t cur_chunk_size = galloc->buffers[i] ? ggml_vbuffer_chunk_size(galloc->buffers[i], c) : 0; | ||||||
|  |             size_t new_chunk_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i], c); | ||||||
|  |             new_size += new_chunk_size; | ||||||
|  |             if (new_chunk_size > cur_chunk_size) { | ||||||
|  |                 realloc = true; | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |         if (realloc) { | ||||||
| #ifndef NDEBUG | #ifndef NDEBUG | ||||||
|  |             size_t cur_size = galloc->buffers[i] ? ggml_vbuffer_size(galloc->buffers[i]) : 0; | ||||||
|             GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); |             GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); | ||||||
| #endif | #endif | ||||||
|  |  | ||||||
|   | |||||||
| @@ -548,6 +548,41 @@ static void test_buffer_size_zero() { | |||||||
|     GGML_ASSERT(backend_b.context->allocated_total() == 0); |     GGML_ASSERT(backend_b.context->allocated_total() == 0); | ||||||
| } | } | ||||||
|  |  | ||||||
|  | // Test re-using gallocr for a different graph. The new graph has the same | ||||||
|  | // total size, but one of the chunks is larger, so reallocation is required. | ||||||
|  | static void test_reallocation() { | ||||||
|  |     dummy_backend    backend = dummy_backend_init(32, /*align*/ 4); | ||||||
|  |     ggml_gallocr_ptr galloc; | ||||||
|  |     { | ||||||
|  |         auto [ctx, graph, ctx_ptr] = make_context(); | ||||||
|  |         ggml_tensor * x[4]; | ||||||
|  |         x[0] = make_input_with_size(ctx, 24); | ||||||
|  |         x[1] = make_input_with_size(ctx, 16); | ||||||
|  |         x[2] = ggml_view_1d(ctx, x[0], 4, 0); | ||||||
|  |         x[3] = ggml_add(ctx, x[2], x[1]); | ||||||
|  |         assign_names(ctx); | ||||||
|  |  | ||||||
|  |         galloc = allocate_graph(graph, x[3], &backend.buffer_type); | ||||||
|  |         check_all_allocated(graph); | ||||||
|  |         GGML_ASSERT(backend.context->allocated_total() == 40); | ||||||
|  |     } | ||||||
|  |     { | ||||||
|  |         auto [ctx, graph, ctx_ptr] = make_context(); | ||||||
|  |         ggml_tensor * x[3]; | ||||||
|  |         x[0] = make_input_with_size(ctx, 20); | ||||||
|  |         x[1] = make_input_with_size(ctx, 20); | ||||||
|  |         x[2] = ggml_add(ctx, x[0], x[1]); | ||||||
|  |         assign_names(ctx); | ||||||
|  |         ggml_set_output(x[2]); | ||||||
|  |         ggml_build_forward_expand(graph, x[2]); | ||||||
|  |  | ||||||
|  |         bool result = ggml_gallocr_alloc_graph(galloc.get(), graph); | ||||||
|  |         GGML_ASSERT(result); | ||||||
|  |         check_all_allocated(graph); | ||||||
|  |         GGML_ASSERT(backend.context->allocated_total() == 40); | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
| static void run(const char * name, void (*f)()) { | static void run(const char * name, void (*f)()) { | ||||||
|     printf("%s ", name); |     printf("%s ", name); | ||||||
|     fflush(stdout); |     fflush(stdout); | ||||||
| @@ -568,5 +603,6 @@ int main() { | |||||||
|     run("test_prefer_already_allocated_memory", test_prefer_already_allocated_memory); |     run("test_prefer_already_allocated_memory", test_prefer_already_allocated_memory); | ||||||
|     run("test_multiple_buffer_types", test_multiple_buffer_types); |     run("test_multiple_buffer_types", test_multiple_buffer_types); | ||||||
|     run("test_buffer_size_zero", test_buffer_size_zero); |     run("test_buffer_size_zero", test_buffer_size_zero); | ||||||
|  |     run("test_reallocation", test_reallocation); | ||||||
|     return 0; |     return 0; | ||||||
| } | } | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Acly
					Acly