Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-11-03 09:22:01 +00:00)
			
		
		
		
	improved memory management fixes
This commit is contained in:
		@@ -7,6 +7,9 @@
 | 
			
		||||
 | 
			
		||||
#define UNUSED(x) (void)(x)
 | 
			
		||||
 | 
			
		||||
//#define AT_PRINTF printf
 | 
			
		||||
#define AT_PRINTF(...) ((void)0)
 | 
			
		||||
 | 
			
		||||
// allocator
 | 
			
		||||
 | 
			
		||||
static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
 | 
			
		||||
@@ -146,16 +149,16 @@ void ggml_allocator_default_alloc_tensor(struct ggml_backend_buffer * alloc, str
 | 
			
		||||
    /////
 | 
			
		||||
    if (alloc->measure && allocator_ctx->size != MAX_SIZE_INIT) {
 | 
			
		||||
        allocator_ctx->size = MAX_SIZE_INIT;
 | 
			
		||||
        //allocator_ctx->data = 0;
 | 
			
		||||
        allocator_ctx->data = 0x1000;
 | 
			
		||||
        allocator_ctx->free_blocks[0].size = MAX_SIZE_INIT;
 | 
			
		||||
        //allocator_ctx->free_blocks[0].addr = 0;
 | 
			
		||||
        allocator_ctx->free_blocks[0].addr = 0x1000;
 | 
			
		||||
    }
 | 
			
		||||
    /////
 | 
			
		||||
 | 
			
		||||
    size_t size = ggml_backend_buffer_get_alloc_size(alloc, tensor);
 | 
			
		||||
    size = aligned_offset(NULL, size, allocator_ctx->alignment);
 | 
			
		||||
 | 
			
		||||
    // printf("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
 | 
			
		||||
    AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
 | 
			
		||||
 | 
			
		||||
    size_t max_avail = 0;
 | 
			
		||||
 | 
			
		||||
@@ -173,7 +176,7 @@ void ggml_allocator_default_alloc_tensor(struct ggml_backend_buffer * alloc, str
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // printf("block %d\n", best_fit_block);
 | 
			
		||||
    AT_PRINTF("block %d\n", best_fit_block);
 | 
			
		||||
 | 
			
		||||
    if (best_fit_block == -1) {
 | 
			
		||||
        fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
 | 
			
		||||
@@ -217,7 +220,8 @@ void ggml_allocator_default_free_tensor(struct ggml_backend_buffer * alloc, stru
 | 
			
		||||
 | 
			
		||||
    size_t size = ggml_backend_buffer_get_alloc_size(alloc, tensor);
 | 
			
		||||
    size = aligned_offset(NULL, size, allocator_ctx->alignment);
 | 
			
		||||
    //printf("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, allocator_ctx->n_free_blocks);
 | 
			
		||||
    AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, allocator_ctx->n_free_blocks);
 | 
			
		||||
    tensor->freed = true;
 | 
			
		||||
 | 
			
		||||
    // see if we can merge with an existing block
 | 
			
		||||
    for (int i = 0; i < allocator_ctx->n_free_blocks; i++) {
 | 
			
		||||
@@ -826,11 +830,13 @@ void ggml_graph_allocate_tensors_n(struct ggml_cgraph ** graphs, int n_graphs, s
 | 
			
		||||
            struct ggml_tensor * node = gf->nodes[i];
 | 
			
		||||
            node->n_children = 0;
 | 
			
		||||
            node->n_views = 0;
 | 
			
		||||
            //node->freed = false;
 | 
			
		||||
        }
 | 
			
		||||
        for (int i = 0; i < gf->n_leafs; i++) {
 | 
			
		||||
            struct ggml_tensor * leaf = gf->leafs[i];
 | 
			
		||||
            leaf->n_children = 0;
 | 
			
		||||
            leaf->n_views = 0;
 | 
			
		||||
            //leaf->freed = false;
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
@@ -839,6 +845,13 @@ void ggml_graph_allocate_tensors_n(struct ggml_cgraph ** graphs, int n_graphs, s
 | 
			
		||||
        struct ggml_cgraph * gf = graphs[g];
 | 
			
		||||
        for (int i = 0; i < gf->n_nodes; i++) {
 | 
			
		||||
            struct ggml_tensor * node = gf->nodes[i];
 | 
			
		||||
            if (ggml_is_view(node)) {
 | 
			
		||||
                struct ggml_tensor * ancestor = node;
 | 
			
		||||
                do {
 | 
			
		||||
                    ancestor = view_parent(ancestor);
 | 
			
		||||
                } while (ggml_is_view(ancestor));
 | 
			
		||||
                ancestor->n_views += 1;
 | 
			
		||||
            }
 | 
			
		||||
            for (int j = 0; j < GGML_MAX_SRC; j++) {
 | 
			
		||||
                struct ggml_tensor * parent = node->src[j];
 | 
			
		||||
                if (parent == NULL) {
 | 
			
		||||
@@ -869,47 +882,74 @@ void ggml_graph_allocate_tensors_n(struct ggml_cgraph ** graphs, int n_graphs, s
 | 
			
		||||
                if (parent == NULL) {
 | 
			
		||||
                    break;
 | 
			
		||||
                }
 | 
			
		||||
                if (parent->freed) {
 | 
			
		||||
                    printf("!!!!!! tensor %s used after free\n", parent->name);
 | 
			
		||||
                }
 | 
			
		||||
                if (ggml_is_view(parent)) {
 | 
			
		||||
                    struct ggml_tensor * ancestor = parent;
 | 
			
		||||
                    do {
 | 
			
		||||
                        ancestor = view_parent(ancestor);
 | 
			
		||||
                    } while (ggml_is_view(ancestor));
 | 
			
		||||
                    if (ancestor->freed) {
 | 
			
		||||
                        printf("!!!!!! tensor %s used after free (as view %s)\n", ancestor->name, parent->name);
 | 
			
		||||
                    }
 | 
			
		||||
                    allocate_node(buffer, ancestor);
 | 
			
		||||
                }
 | 
			
		||||
                allocate_node(buffer, parent);
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
            // allocate node
 | 
			
		||||
            allocate_node(buffer, node);
 | 
			
		||||
 | 
			
		||||
            // update parents
 | 
			
		||||
            if (is_view) {
 | 
			
		||||
                struct ggml_tensor * ancestor = node;
 | 
			
		||||
                do {
 | 
			
		||||
                    ancestor = view_parent(ancestor);
 | 
			
		||||
                } while (ggml_is_view(ancestor));
 | 
			
		||||
                ancestor->n_views -= 1;
 | 
			
		||||
                if (ancestor->n_views == 0) {
 | 
			
		||||
                    ggml_backend_buffer_tensor_free(buffer->backend_buffer, ancestor);
 | 
			
		||||
            AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
 | 
			
		||||
            for (int j = 0; j < GGML_MAX_SRC; j++) {
 | 
			
		||||
                struct ggml_tensor * parent = node->src[j];
 | 
			
		||||
                if (parent == NULL) {
 | 
			
		||||
                    break;
 | 
			
		||||
                }
 | 
			
		||||
            } else {
 | 
			
		||||
                for (int j = 0; j < GGML_MAX_SRC; j++) {
 | 
			
		||||
                    struct ggml_tensor * parent = node->src[j];
 | 
			
		||||
                    if (parent == NULL) {
 | 
			
		||||
                        break;
 | 
			
		||||
                    }
 | 
			
		||||
                AT_PRINTF("%s", parent->name);
 | 
			
		||||
                if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
 | 
			
		||||
                    AT_PRINTF(", ");
 | 
			
		||||
                }
 | 
			
		||||
            }
 | 
			
		||||
            AT_PRINTF("\n");
 | 
			
		||||
 | 
			
		||||
            // update parents
 | 
			
		||||
            for (int j = 0; j < GGML_MAX_SRC; j++) {
 | 
			
		||||
                struct ggml_tensor * parent = node->src[j];
 | 
			
		||||
                if (parent == NULL) {
 | 
			
		||||
                    break;
 | 
			
		||||
                }
 | 
			
		||||
                parent->n_children -= 1;
 | 
			
		||||
                if (parent->n_children == 0 && parent->n_views == 0) {
 | 
			
		||||
                    if (ggml_is_view(parent)) {
 | 
			
		||||
                        struct ggml_tensor * ancestor = parent;
 | 
			
		||||
                        do {
 | 
			
		||||
                            ancestor = view_parent(ancestor);
 | 
			
		||||
                        } while (ggml_is_view(ancestor));
 | 
			
		||||
                        ancestor->n_views -= 1;
 | 
			
		||||
                        if (ancestor->n_views == 0) {
 | 
			
		||||
                        if (ancestor->n_views == 0 && ancestor->n_children == 0) {
 | 
			
		||||
                            ggml_backend_buffer_tensor_free(buffer->backend_buffer, ancestor);
 | 
			
		||||
                        }
 | 
			
		||||
                    }
 | 
			
		||||
                    else {
 | 
			
		||||
                        parent->n_children -= 1;
 | 
			
		||||
                        if (parent->n_children == 0) {
 | 
			
		||||
                            // free parent
 | 
			
		||||
                            ggml_backend_buffer_tensor_free(buffer->backend_buffer, parent);
 | 
			
		||||
                        }
 | 
			
		||||
                        ggml_backend_buffer_tensor_free(buffer->backend_buffer, parent);
 | 
			
		||||
                    }
 | 
			
		||||
                }
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
            if (is_view) {
 | 
			
		||||
                struct ggml_tensor * ancestor = node;
 | 
			
		||||
                do {
 | 
			
		||||
                    ancestor = view_parent(ancestor);
 | 
			
		||||
                } while (ggml_is_view(ancestor));
 | 
			
		||||
                ancestor->n_views -= 1;
 | 
			
		||||
                if (ancestor->n_views == 0 && ancestor->n_children == 0) {
 | 
			
		||||
                    ggml_backend_buffer_tensor_free(buffer->backend_buffer, ancestor);
 | 
			
		||||
                }
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
            AT_PRINTF("\n");
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										1
									
								
								ggml.c
									
									
									
									
									
								
							
							
						
						
									
										1
									
								
								ggml.c
									
									
									
									
									
								
							@@ -4533,6 +4533,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
 | 
			
		||||
        /*.node_id      =*/ -1,
 | 
			
		||||
        /*.n_children   =*/ 0,
 | 
			
		||||
        /*.n_views      =*/ 0,
 | 
			
		||||
        /*.freed        =*/ false,
 | 
			
		||||
        /*.perf_runs    =*/ 0,
 | 
			
		||||
        /*.perf_cycles  =*/ 0,
 | 
			
		||||
        /*.perf_time_us =*/ 0,
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										3
									
								
								ggml.h
									
									
									
									
									
								
							
							
						
						
									
										3
									
								
								ggml.h
									
									
									
									
									
								
							@@ -425,6 +425,7 @@ extern "C" {
 | 
			
		||||
        int node_id; // used to build graphs
 | 
			
		||||
        int n_children;
 | 
			
		||||
        int n_views;
 | 
			
		||||
        bool freed; // debug
 | 
			
		||||
 | 
			
		||||
        // performance
 | 
			
		||||
        int     perf_runs;
 | 
			
		||||
@@ -437,7 +438,7 @@ extern "C" {
 | 
			
		||||
 | 
			
		||||
        void * extra; // extra things e.g. for ggml-cuda.cu
 | 
			
		||||
 | 
			
		||||
        char padding[12];
 | 
			
		||||
        char padding[8];
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
    static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
 | 
			
		||||
 
 | 
			
		||||
@@ -703,7 +703,9 @@ static bool kv_cache_init(
 | 
			
		||||
    const int64_t n_mem      = n_layer*n_ctx;
 | 
			
		||||
    const int64_t n_elements = n_embd*n_mem;
 | 
			
		||||
 | 
			
		||||
    size_t size = 2u*n_elements*ggml_type_size(wtype) + 2u*MB;
 | 
			
		||||
    size_t size = 2u*n_elements*ggml_type_size(wtype);
 | 
			
		||||
 | 
			
		||||
    fprintf(stderr, "%s: allocating %.2f MB for kv cache\n", __func__, size / 1024.0 / 1024.0);
 | 
			
		||||
 | 
			
		||||
    cache.buf = ggml_buffer_alloc(backend, size, 2);
 | 
			
		||||
    cache.n = 0;
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user