	ggml : add is_ram_shared to ggml_backend
Metal can share RAM with the CPU and can therefore use mmap without a temporary buffer.
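The new flag marks backends whose memory is ordinary host RAM (the CPU backend, and Metal on unified-memory systems), so the model loader can point tensors straight into an mmap'ed file instead of staging the data through a temporary buffer. Below is a minimal sketch of the idea: the struct layout follows the diff, while the tensor stand-in and the helper function are illustrative only, not the real ggml/llama.cpp code.

/* Sketch only: ggml_backend layout as in the diff below; the rest is a trimmed
 * stand-in that shows how a loader can branch on is_ram_shared. */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

typedef void * ggml_backend_context_t;   /* opaque handle, as in the header hunk below */
struct ggml_backend_interface;           /* table of backend functions */

struct ggml_backend {
    struct ggml_backend_interface * interface;
    ggml_backend_context_t          context;

    /* true when the backend operates on the same physical RAM as the CPU
     * (CPU, Metal with unified memory); false for discrete devices (CUDA). */
    bool is_ram_shared;
};

/* Trimmed tensor stand-in: just enough to show the loading decision. */
struct tensor_stub {
    struct ggml_backend * backend;
    void                * data;
};

/* If the backend shares RAM, an mmap'ed weight can be used in place;
 * otherwise it must be staged through a temporary buffer and uploaded. */
void assign_weight_data(struct tensor_stub * t, uint8_t * mmap_addr, size_t file_off) {
    if (t->backend->is_ram_shared) {
        t->data = mmap_addr + file_off;   /* zero-copy: point into the mapping */
    } else {
        /* read into a temporary buffer, then copy to device memory (omitted) */
    }
}

In the hunks that follow, the CPU backend sets the flag to true and the CUDA backend to false, and the llama.cpp loader switches from comparing against &model.backend_cpu to checking is_ram_shared, which lets the Metal backend take the mmap path as well.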
		| @@ -255,8 +255,9 @@ struct ggml_backend ggml_backend_cpu_init(void) { | ||||
|     ctx->work_size = 0; | ||||
|  | ||||
|     struct ggml_backend cpu_backend = { | ||||
|         /* .interface = */ &cpu_backend_interface, | ||||
|         /* .context   = */ ctx | ||||
|         /* .interface     = */ &cpu_backend_interface, | ||||
|         /* .context       = */ ctx, | ||||
|         /* .is_ram_shared = */ true, | ||||
|     }; | ||||
|     return cpu_backend; | ||||
| } | ||||
|   | ||||
| @@ -61,7 +61,10 @@ extern "C" { | ||||
|  | ||||
|     struct ggml_backend { | ||||
|         struct ggml_backend_interface * interface; | ||||
|  | ||||
|         ggml_backend_context_t context; | ||||
|  | ||||
|         bool is_ram_shared; | ||||
|     }; | ||||
|  | ||||
|     // backend helper functions | ||||
| @@ -78,7 +81,16 @@ extern "C" { | ||||
|     static inline void ggml_backend_graph_compute(struct ggml_backend * backend, struct ggml_cgraph * cgraph) { backend->interface->graph_compute(backend->context, cgraph); } | ||||
|  | ||||
|     // buffer and tensor allocation | ||||
|     GGML_API struct ggml_buffer ggml_backend_alloc_buffer(struct ggml_backend * backend, size_t size, size_t max_tensors); // GG: probably return ptr | ||||
|     // TODO: | ||||
|     //  - return "struct ggml_buffer *" | ||||
|     //  - fix namings: | ||||
|     //    - ggml_backend_alloc_buffer -> ggml_backend_buffer_alloc | ||||
|     //    - ggml_backend_free_buffer  -> ggml_backend_buffer_free | ||||
|     //    - ggml_backend_reset_buffer -> ggml_backend_buffer_reset | ||||
|     //    - ggml_backend_alloc_tensor -> ggml_backend_tensor_alloc | ||||
|     //    - ggml_backend_tensor_cpy   -> ggml_backend_tensor_copy | ||||
|     // | ||||
|     GGML_API struct ggml_buffer ggml_backend_alloc_buffer(struct ggml_backend * backend, size_t size, size_t max_tensors); | ||||
|     GGML_API void               ggml_backend_free_buffer(struct ggml_buffer * buffer); | ||||
|     static inline void          ggml_backend_reset_buffer(struct ggml_buffer * buffer) { buffer->backend->interface->reset_buffer(buffer->backend->context, buffer->backend_buffer); } | ||||
|     static inline void          ggml_backend_alloc_tensor(struct ggml_buffer * buffer, struct ggml_tensor * tensor) { buffer->backend->interface->alloc_tensor(buffer->backend->context, buffer->backend_buffer, tensor); } | ||||
|   | ||||
| @@ -1834,8 +1834,9 @@ ggml_backend ggml_backend_cuda_init(void) { | ||||
|     ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context; | ||||
|  | ||||
|     ggml_backend cuda_backend = { | ||||
|         /* .interface = */ &cuda_backend_interface, | ||||
|         /* .context   = */ ctx | ||||
|         /* .interface     = */ &cuda_backend_interface, | ||||
|         /* .context       = */ ctx, | ||||
|         /* .is_ram_shared = */ false, | ||||
|     }; | ||||
|     return cuda_backend; | ||||
| } | ||||
|   | ||||
							
								
								
									
llama.cpp (65 changed lines):
							| @@ -225,6 +225,7 @@ struct llama_model { | ||||
|     llama_vocab vocab; | ||||
|  | ||||
|     // backends | ||||
|     // TODO: change to pointers | ||||
|     ggml_backend   backend_cpu; | ||||
|     ggml_buffer    buf_cpu; | ||||
|     ggml_context * ctx_cpu = NULL; | ||||
| @@ -298,6 +299,7 @@ struct llama_context { | ||||
|  | ||||
|     // memory buffers used to evaluate the model | ||||
|     ggml_buffer buf_compute_cpu = {}; | ||||
|  | ||||
| #ifdef GGML_USE_CUDA | ||||
|     ggml_buffer buf_compute_cuda = {}; | ||||
| #endif | ||||
| @@ -612,7 +614,7 @@ struct llama_model_loader { | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     void load_all_data(llama_progress_callback progress_callback, void *  progress_callback_user_data, llama_mlock * lmlock) { | ||||
|     void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) { | ||||
|         size_t data_size = 0; | ||||
|         size_t lock_size = 0; | ||||
|         for (const llama_load_tensor & lt : tensors_map.tensors) { | ||||
| @@ -634,11 +636,11 @@ struct llama_model_loader { | ||||
|             } | ||||
|             LLAMA_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already | ||||
|  | ||||
|             bool is_cpu = lt.ggml_tensor->backend == &model->backend_cpu; | ||||
|             const bool is_ram_shared = lt.ggml_tensor->backend->is_ram_shared; | ||||
|  | ||||
|             // select buffer to load data into | ||||
|             if (!use_mmap) { | ||||
|                 if (is_cpu) { | ||||
|                 if (is_ram_shared) { | ||||
|                     lt.data = (uint8_t *) lt.ggml_tensor->data; | ||||
|                 } else { | ||||
|                     // read to temporary buffer | ||||
| @@ -649,7 +651,7 @@ struct llama_model_loader { | ||||
|  | ||||
|             load_data_for(lt); | ||||
|  | ||||
|             if (is_cpu) { | ||||
|             if (is_ram_shared) { | ||||
|                 if (use_mmap) { | ||||
|                     lt.ggml_tensor->data = lt.data; | ||||
|                     // TODO: this assumes that the data to lock is contiguous, which may not always be the case | ||||
| @@ -671,7 +673,7 @@ struct llama_model_loader { | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     void load_data_for(llama_load_tensor & lt) { | ||||
|     void load_data_for(llama_load_tensor & lt) const { | ||||
|         if (use_mmap) { | ||||
|             lt.data = (uint8_t *) mapping->addr + lt.file_off; | ||||
|         } else { | ||||
| @@ -957,6 +959,7 @@ static void llama_model_load_internal( | ||||
|  | ||||
|     ggml_backend * backend_cpu = &model.backend_cpu; | ||||
|     ggml_backend * backend_gpu = &model.backend_cpu; // hack until we have a proper backend selection | ||||
|  | ||||
| #ifdef GGML_USE_CUDA | ||||
|     if (n_gpu_layers > 0) { | ||||
|         model.backend_cuda = ggml_backend_cuda_init(); | ||||
| @@ -965,13 +968,14 @@ static void llama_model_load_internal( | ||||
| #endif | ||||
| #ifdef GGML_USE_METAL | ||||
|     if (n_gpu_layers > 0) { | ||||
|         model.backend_metal = ggml_backend_cpu_init(); | ||||
|         model.backend_metal = ggml_backend_metal_init(); | ||||
|         backend_gpu = &model.backend_metal; | ||||
|     } | ||||
| #endif | ||||
|  | ||||
|     // assign splits to the backends | ||||
|     const int i_gpu_start = std::max(0, (int)n_layer - n_gpu_layers); | ||||
|  | ||||
|     model.backend_inp = n_gpu_layers > (int)n_layer ? backend_gpu : backend_cpu; | ||||
|     model.backend_out = n_gpu_layers > 0            ? backend_gpu : backend_cpu; | ||||
|  | ||||
| @@ -1011,7 +1015,7 @@ static void llama_model_load_internal( | ||||
|     fprintf(stderr, "%s: ggml ctx sizes:\n", __func__); | ||||
|     for (const auto & it : ctx_sizes) { | ||||
|         fprintf(stderr, "%8s = %7.2f MB", ggml_backend_name(it.first), it.second / 1024.0 / 1024.0); | ||||
|         if (it.first == backend_cpu && ml->use_mmap) { | ||||
|         if (it.first->is_ram_shared && ml->use_mmap) { | ||||
|             fprintf(stderr, " + %7.2f MB (mmap)", mmap_size / 1024.0 / 1024.0); | ||||
|         } | ||||
|         fprintf(stderr, "\n"); | ||||
| @@ -1135,12 +1139,10 @@ static void llama_model_load_internal( | ||||
|             ctx_sum += it.second; | ||||
|         } | ||||
|  | ||||
|         const size_t mem_required = | ||||
|             ctx_sum + MEM_REQ_EVAL().at(model.type); | ||||
|         const size_t mem_required = ctx_sum + MEM_REQ_EVAL().at(model.type); | ||||
|  | ||||
|         // this is the memory required by one llama_state | ||||
|         const size_t mem_required_state = | ||||
|             scale*MEM_REQ_KV_SELF().at(model.type); | ||||
|         const size_t mem_required_state = scale*MEM_REQ_KV_SELF().at(model.type); | ||||
|  | ||||
|         fprintf(stderr, "%s: mem required  = %7.2f MB (+ %7.2f MB per state)\n", __func__, | ||||
|                 mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0); | ||||
| @@ -1162,6 +1164,7 @@ static void llama_model_load_internal( | ||||
|     // loading time will be recalculate after the first eval, so | ||||
|     // we take page faults deferred by mmap() into consideration | ||||
|     model.t_load_us = ggml_time_us() - model.t_start_us; | ||||
|  | ||||
| } | ||||
|  | ||||
| static bool llama_model_load( | ||||
| @@ -1226,6 +1229,7 @@ static ggml_graph_splits llama_build_graph( | ||||
|     // initialize contexts for every backend | ||||
|  | ||||
|     struct ggml_context * ctx_cpu = nullptr; | ||||
|  | ||||
|     if (lctx.buf_compute_cpu.mem_size > 0) { | ||||
|         struct ggml_init_params params = ggml_init_params_default(); | ||||
|         params.buffer = &lctx.buf_compute_cpu; | ||||
| @@ -1235,6 +1239,7 @@ static ggml_graph_splits llama_build_graph( | ||||
|  | ||||
| #ifdef GGML_USE_CUDA | ||||
|     struct ggml_context * ctx_cuda = nullptr; | ||||
|  | ||||
|     if (lctx.buf_compute_cuda.mem_size > 0) { | ||||
|         struct ggml_init_params params = ggml_init_params_default(); | ||||
|         params.buffer = &lctx.buf_compute_cuda; | ||||
| @@ -1243,30 +1248,54 @@ static ggml_graph_splits llama_build_graph( | ||||
|     } | ||||
| #endif | ||||
|  | ||||
| #ifdef GGML_USE_METAL | ||||
|     struct ggml_context * ctx_metal = nullptr; | ||||
|  | ||||
|     if (lctx.buf_compute_metal.mem_size > 0) { | ||||
|         struct ggml_init_params params = ggml_init_params_default(); | ||||
|         params.buffer = &lctx.buf_compute_metal; | ||||
|         params.compute_type = compute_type; | ||||
|         ctx_metal = ggml_init(params); | ||||
|     } | ||||
| #endif | ||||
|  | ||||
|     // TODO: clean this | ||||
|     struct ggml_context * ctx_i      = nullptr; | ||||
|     struct ggml_context * ctx_ls[80] = {nullptr}; | ||||
|     struct ggml_context * ctx_o      = nullptr; | ||||
|     struct ggml_context * ctx_kv     = nullptr; | ||||
|     struct ggml_context * ctx_ls[80] = {nullptr}; | ||||
|  | ||||
|     if (lctx.model.backend_inp == &lctx.model.backend_cpu) ctx_i = ctx_cpu; | ||||
|     if (lctx.model.backend_out == &lctx.model.backend_cpu) ctx_o = ctx_cpu; | ||||
|  | ||||
| #ifdef GGML_USE_CUDA | ||||
|     if (lctx.model.backend_inp == &lctx.model.backend_cuda) ctx_i = ctx_cuda; | ||||
|     if (lctx.model.backend_out == &lctx.model.backend_cuda) ctx_o = ctx_cuda; | ||||
| #endif | ||||
| #ifdef GGML_USE_METAL | ||||
|     if (lctx.model.backend_inp == &lctx.model.backend_metal) ctx_i = ctx_metal; | ||||
|     if (lctx.model.backend_out == &lctx.model.backend_metal) ctx_o = ctx_metal; | ||||
| #endif | ||||
|  | ||||
|     for (int il = 0; il < n_layer; il++) { | ||||
|         if (lctx.model.backend_layers[il] == &lctx.model.backend_cpu)  ctx_ls[il] = ctx_cpu; | ||||
|         if (lctx.model.backend_layers[il] == &lctx.model.backend_cpu) ctx_ls[il] = ctx_cpu; | ||||
|  | ||||
| #ifdef GGML_USE_CUDA | ||||
|         if (lctx.model.backend_layers[il] == &lctx.model.backend_cuda) ctx_ls[il] = ctx_cuda; | ||||
| #endif | ||||
| #ifdef GGML_USE_METAL | ||||
|         if (lctx.model.backend_layers[il] == &lctx.model.backend_metal) ctx_ls[il] = ctx_metal; | ||||
| #endif | ||||
|     } | ||||
|  | ||||
|     if (lctx.backend_kv == &lctx.model.backend_cpu)  ctx_kv = ctx_cpu; | ||||
|     if (lctx.backend_kv == &lctx.model.backend_cpu) ctx_kv = ctx_cpu; | ||||
|  | ||||
| #ifdef GGML_USE_CUDA | ||||
|     if (lctx.backend_kv == &lctx.model.backend_cuda) ctx_kv = ctx_cuda; | ||||
| #endif | ||||
| #ifdef GGML_USE_METAL | ||||
|     if (lctx.backend_kv == &lctx.model.backend_metal) ctx_kv = ctx_metal; | ||||
| #endif | ||||
|  | ||||
|     struct ggml_tensor * inpL; | ||||
|  | ||||
| @@ -1522,7 +1551,7 @@ static ggml_graph_splits llama_build_graph( | ||||
|     //} | ||||
|  | ||||
| #ifdef LLAMA_1L_GRAPH_DUMP | ||||
|     if (N==1 && n_past == 0) { | ||||
|     if (N == 1 && n_past == 0) { | ||||
|         ggml_graph_dump_dot(gf, NULL, "llama.dot"); | ||||
|         printf("graph for N=%i, n_past=%i dumped to llama.dot\n", N, n_past); | ||||
|         exit(0); | ||||
| @@ -1547,6 +1576,11 @@ static ggml_graph_splits llama_build_graph( | ||||
|         ggml_free(ctx_cuda); | ||||
|     } | ||||
| #endif | ||||
| #ifdef GGML_USE_METAL | ||||
|     if (ctx_metal != nullptr) { | ||||
|         ggml_free(ctx_metal); | ||||
|     } | ||||
| #endif | ||||
|  | ||||
|     return splits; | ||||
| } | ||||
| @@ -2651,7 +2685,6 @@ struct llama_context * llama_new_context_with_model( | ||||
|     ctx->rng = std::mt19937(params.seed); | ||||
|     ctx->logits_all = params.logits_all; | ||||
|  | ||||
|  | ||||
|     // TODO: choose backend depending on n_layers/low_vram | ||||
| #ifdef GGML_USE_CUDA | ||||
|     if ((uint32_t)params.n_gpu_layers >= model->hparams.n_layer/2) { | ||||
|   | ||||
Author: Georgi Gerganov