metal : create backend, mostly reuse CPU backend interface
@@ -35,7 +35,7 @@ extern "C" {
 #endif
 
 // GG: maybe return ptr and avoid the "ggml.h" include
-struct ggml_backend ggml_backend_metal_init();
+struct ggml_backend ggml_backend_metal_init(struct ggml_backend * backend_cpu);
 
 //struct ggml_metal_context;
 //
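The signature change above is the crux of the commit: the Metal backend is now constructed from an already-initialized CPU backend so that it can reuse the CPU backend's buffer and tensor management (see the ggml-metal.m hunk below). A minimal sketch of the resulting call pattern, assuming a ggml_backend_cpu_init() constructor with the same return-by-value convention (that function is not shown in this diff):

    // sketch only: ggml_backend_metal_init copies function pointers out of
    // backend_cpu->interface, so backend_cpu is not otherwise retained
    struct ggml_backend backend_cpu   = ggml_backend_cpu_init();
    struct ggml_backend backend_metal = ggml_backend_metal_init(&backend_cpu);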
ggml-metal.m (69 changed lines)
@@ -993,30 +993,59 @@ void ggml_metal_graph_compute(
     }
 }
 
+static const char * ggml_backend_metal_name(ggml_backend_context_t ctx) {
+    return "Metal";
+
+    UNUSED(ctx);
+}
+
+static void ggml_backend_metal_graph_compute(ggml_backend_context_t ctx, struct ggml_cgraph * cgraph) {
+    struct ggml_metal_context * ctx_metal = (struct ggml_metal_context *) ctx;
+
+    ggml_metal_graph_compute(ctx_metal, cgraph);
+}
+
 static struct ggml_backend_interface metal_backend_interface = {
-    /* .get_name            = */ //ggml_backend_metal_name,
-    /* .free_context        = */ //ggml_backend_metal_free_context,
-    /* .alloc_buffer        = */ //ggml_backend_metal_alloc_buffer,
-    /* .free_buffer         = */ //ggml_backend_metal_free_buffer,
-    /* .reset_buffer        = */ //ggml_backend_metal_reset_buffer,
-    /* .alloc_tensor        = */ //ggml_backend_metal_alloc_tensor,
-    /* .set_tensor_async    = */ //ggml_backend_metal_set_tensor_async,
-    /* .get_tensor_async    = */ //ggml_backend_metal_get_tensor_async,
-    /* .synchronize         = */ //ggml_backend_metal_synchronize,
-    /* .cpy_tensor_from     = */ //nullptr,
-    /* .cpy_tensor_to       = */ //nullptr,
-    /* .graph_plan_create   = */ //ggml_backend_metal_graph_plan_create,
-    /* .graph_plan_free     = */ //ggml_backend_metal_graph_plan_free,
-    /* .graph_plan_compute  = */ //ggml_backend_metal_graph_plan_compute,
-    /* .graph_compute       = */ //ggml_backend_metal_graph_compute
+    /* .get_name            = */ ggml_backend_metal_name,
+    /* .free_context        = */ NULL, //ggml_backend_metal_free_context,
+    /* .alloc_buffer        = */ NULL, //ggml_backend_metal_alloc_buffer,
+    /* .free_buffer         = */ NULL, //ggml_backend_metal_free_buffer,
+    /* .reset_buffer        = */ NULL, //ggml_backend_metal_reset_buffer,
+    /* .alloc_tensor        = */ NULL, //ggml_backend_metal_alloc_tensor,
+    /* .set_tensor_async    = */ NULL, //ggml_backend_metal_set_tensor_async,
+    /* .get_tensor_async    = */ NULL, //ggml_backend_metal_get_tensor_async,
+    /* .synchronize         = */ NULL, //ggml_backend_metal_synchronize,
+    /* .cpy_tensor_from     = */ NULL, //nullptr,
+    /* .cpy_tensor_to       = */ NULL, //nullptr,
+    /* .graph_plan_create   = */ NULL, //ggml_backend_metal_graph_plan_create,
+    /* .graph_plan_free     = */ NULL, //ggml_backend_metal_graph_plan_free,
+    /* .graph_plan_compute  = */ NULL, //ggml_backend_metal_graph_plan_compute,
+    /* .graph_compute       = */ ggml_backend_metal_graph_compute,
 };
 
-struct ggml_backend ggml_backend_metal_init(void) {
+struct ggml_backend ggml_backend_metal_init(struct ggml_backend * backend_cpu) {
     struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
 
-    struct ggml_backend metal_backend = {
-        /* .interface = */ &metal_backend_interface,
-        /* .context   = */ ctx
+    struct ggml_backend backend_metal = {
+        /* .interface     = */ &metal_backend_interface,
+        /* .context       = */ ctx,
+        /* .is_ram_shared = */ true,
     };
-    return metal_backend;
+
+    // reuses CPU calls for now
+    backend_metal.interface->free_context       = backend_cpu->interface->free_context;
+    backend_metal.interface->alloc_buffer       = backend_cpu->interface->alloc_buffer;
+    backend_metal.interface->free_buffer        = backend_cpu->interface->free_buffer;
+    backend_metal.interface->reset_buffer       = backend_cpu->interface->reset_buffer;
+    backend_metal.interface->alloc_tensor       = backend_cpu->interface->alloc_tensor;
+    backend_metal.interface->set_tensor_async   = backend_cpu->interface->set_tensor_async;
+    backend_metal.interface->get_tensor_async   = backend_cpu->interface->get_tensor_async;
+    backend_metal.interface->synchronize        = backend_cpu->interface->synchronize;
+    backend_metal.interface->cpy_tensor_from    = backend_cpu->interface->cpy_tensor_from;
+    backend_metal.interface->cpy_tensor_to      = backend_cpu->interface->cpy_tensor_to;
+    backend_metal.interface->graph_plan_create  = backend_cpu->interface->graph_plan_create;
+    backend_metal.interface->graph_plan_free    = backend_cpu->interface->graph_plan_free;
+    backend_metal.interface->graph_plan_compute = backend_cpu->interface->graph_plan_compute;
+
+    return backend_metal;
 }
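Two details in the hunk above are worth noting. First, backend_metal.interface points at the file-scope static metal_backend_interface, so the assignments at the end of ggml_backend_metal_init patch that shared table in place; calling init twice just rewrites the same slots with the same CPU function pointers. Second, only get_name and graph_compute stay Metal-specific, which is the "mostly reuse CPU backend interface" of the commit title. A sketch of the dispatch-table shape this relies on, inferred from the initializer comments; the exact field types are assumptions (the void * context typedef matches the cast in ggml_backend_metal_graph_compute):

    // sketch of the implied interface layout (not the commit's own code)
    struct ggml_cgraph;
    typedef void * ggml_backend_context_t;

    struct ggml_backend_interface {
        const char * (*get_name)     (ggml_backend_context_t ctx);
        void         (*free_context) (ggml_backend_context_t ctx);
        // ... buffer, tensor and graph-plan slots elided; see the table above ...
        void         (*graph_compute)(ggml_backend_context_t ctx, struct ggml_cgraph * cgraph);
    };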
llama.cpp (19 changed lines)
@@ -968,7 +968,7 @@ static void llama_model_load_internal(
 #endif
 #ifdef GGML_USE_METAL
     if (n_gpu_layers > 0) {
-        model.backend_metal = ggml_backend_metal_init();
+        model.backend_metal = ggml_backend_metal_init(backend_cpu);
         backend_gpu = &model.backend_metal;
     }
 #endif
@@ -1008,17 +1008,20 @@ static void llama_model_load_internal(
     // TODO: generalize support for mmap
     size_t mmap_size = 0;
     if (ml->use_mmap) {
-        mmap_size = ctx_sizes[backend_cpu];
-        ctx_sizes[backend_cpu] = 0;
+        for (auto & it : ctx_sizes) {
+            if (it.first->is_ram_shared) {
+                mmap_size += it.second;
+                ctx_sizes[it.first] = 0;
+            }
+        }
     }
 
     fprintf(stderr, "%s: ggml ctx sizes:\n", __func__);
     for (const auto & it : ctx_sizes) {
-        fprintf(stderr, "%8s = %7.2f MB", ggml_backend_name(it.first), it.second / 1024.0 / 1024.0);
-        if (it.first->is_ram_shared && ml->use_mmap) {
-            fprintf(stderr, " + %7.2f MB (mmap)", mmap_size / 1024.0 / 1024.0);
-        }
-        fprintf(stderr, "\n");
+        fprintf(stderr, "%8s = %7.2f MB\n", ggml_backend_name(it.first), it.second / 1024.0 / 1024.0);
+    }
+    if (mmap_size > 0) {
+        fprintf(stderr, "%8s = %7.2f MB\n", "mmap", mmap_size / 1024.0 / 1024.0);
     }
 
     // create the buffers and contexts
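The mmap accounting above generalizes the earlier CPU-only special case: every backend whose memory is shared with system RAM now contributes its context size to the mmap region and is excluded from regular allocation, and the log prints one combined mmap total instead of repeating it per backend. A self-contained sketch of the same accumulation, with stand-in types for ggml_backend and the ctx_sizes map (all names here are illustrative, not from the tree):

    #include <cstddef>
    #include <cstdio>
    #include <map>

    struct backend_t { bool is_ram_shared; };   // stand-in for ggml_backend

    int main() {
        backend_t cpu{true}, metal{true}, cuda{false};
        std::map<backend_t *, std::size_t> ctx_sizes = {
            {&cpu, 512u << 20}, {&metal, 256u << 20}, {&cuda, 1024u << 20},
        };

        // same loop shape as the hunk: RAM-shared backends get mmapped instead
        std::size_t mmap_size = 0;
        for (auto & it : ctx_sizes) {
            if (it.first->is_ram_shared) {
                mmap_size += it.second;
                ctx_sizes[it.first] = 0;    // excluded from regular allocation
            }
        }

        std::printf("%8s = %7.2f MB\n", "mmap", mmap_size / 1024.0 / 1024.0);  // 768.00 MB
    }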