mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-31 08:51:55 +00:00)
	llama : add ability to cancel model loading (#4462)
* llama : Add ability to cancel model load

  Updated llama_progress_callback so that if it returns false, the model loading is aborted.

* llama : Add test for model load cancellation

* Fix bool return in llama_model_load, remove std::ignore use

* Update llama.cpp

  Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>

* Fail test if model file is missing

* Revert "Fail test if model file is missing"

  This reverts commit 32ebd525bf.

* Add test-model-load-cancel to Makefile

* Revert "Revert "Fail test if model file is missing""

  This reverts commit 2796953257.

* Simplify .gitignore for tests, clang-tidy fixes

* Label all ctest tests

* ci : ctest uses -L main

* Attempt at writing ctest_with_model

* ci : get ci/run.sh working with test-model-load-cancel

* ci : restrict .github/workflows/build.yml ctest to -L main

* update requirements.txt

* Disable test-model-load-cancel in make

* Remove venv before creation

* Restructure requirements.txt

  Top-level now imports the specific additional requirements for each python file. Using `pip install -r requirements.txt` will fail if versions become mismatched in the per-file requirements.

* Make per-python-script requirements work alone

  This doesn't break the main requirements.txt.

* Add comment

* Add convert-persimmon-to-gguf.py to new requirements.txt scheme

* Add check-requirements.sh script and GitHub workflow

* Remove shellcheck installation step from workflow

* Add nocleanup special arg

* Fix merge

  see: https://github.com/ggerganov/llama.cpp/pull/4462#discussion_r1434593573

* reset to upstream/master

* Redo changes for cancelling model load

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>
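Usage note (not part of the commit): below is a minimal sketch of how a caller could cancel a load with the updated callback. It assumes the llama.h C API as of this commit (bool-returning llama_progress_callback, llama_backend_init(bool numa)); the g_cancel flag and the command-line handling are purely illustrative.

    // Sketch only: cancel a model load by returning false from the progress callback.
    #include "llama.h"

    #include <atomic>
    #include <cstdio>

    // Illustrative cancellation flag; in a real application this could be set
    // from a UI thread or a signal handler.
    static std::atomic<bool> g_cancel{false};

    int main(int argc, char ** argv) {
        if (argc < 2) {
            fprintf(stderr, "usage: %s <model.gguf>\n", argv[0]);
            return 1;
        }

        llama_backend_init(false); // signature at the time of this commit

        llama_model_params params = llama_model_default_params();
        params.progress_callback_user_data = nullptr;
        params.progress_callback = [](float progress, void * /*user_data*/) -> bool {
            fprintf(stderr, "\rloading: %3.0f%%", progress * 100.0f);
            // Returning false aborts the load; llama_load_model_from_file then
            // frees its allocations and returns nullptr.
            return !g_cancel.load();
        };

        llama_model * model = llama_load_model_from_file(argv[1], params);
        if (model == nullptr) {
            fprintf(stderr, "\nmodel load failed or was cancelled\n");
            llama_backend_free();
            return 1;
        }

        fprintf(stderr, "\nmodel loaded\n");
        llama_free_model(model);
        llama_backend_free();
        return 0;
    }

Returning false from the callback is reported to the caller only as a nullptr result from llama_load_model_from_file; the -2 status and the "cancelled model load" log line in the diff below are internal.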
Changed file: llama.cpp (46 lines changed)
@@ -2372,7 +2372,8 @@ struct llama_model_loader {
         }
     }

-    void load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) const {
+    // Returns false if cancelled by progress_callback
+    bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) const {
         size_t size_data = 0;

         for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
@@ -2404,7 +2405,9 @@
             GGML_ASSERT(cur); // unused tensors should have been caught by load_data already

             if (progress_callback) {
-                progress_callback((float) size_done / size_data, progress_callback_user_data);
+                if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
+                    return false;
+                }
             }

             const size_t offs = file_offset(ggml_get_name(cur));
@@ -2466,8 +2469,11 @@
         }

         if (progress_callback) {
-            progress_callback(1.0f, progress_callback_user_data);
+            // Even though the model is done loading, we still honor
+            // cancellation since we need to free allocations.
+            return progress_callback(1.0f, progress_callback_user_data);
         }
+        return true;
     }
 };

@@ -3044,7 +3050,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     if (vocab.linefeed_id    != -1) { LLAMA_LOG_INFO( "%s: LF token         = %d '%s'\n", __func__, vocab.linefeed_id,    vocab.id_to_token[vocab.linefeed_id].text.c_str() );    }
 }

-static void llm_load_tensors(
+// Returns false if cancelled by progress_callback
+static bool llm_load_tensors(
         llama_model_loader & ml,
         llama_model & model,
         int n_gpu_layers,
@@ -3722,16 +3729,20 @@ static void llm_load_tensors(
         model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
     }

-    ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? &model.mlock_mmap : NULL);
+    if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? &model.mlock_mmap : NULL)) {
+        return false;
+    }

     model.mapping = std::move(ml.mapping);

     // loading time will be recalculate after the first eval, so
     // we take page faults deferred by mmap() into consideration
     model.t_load_us = ggml_time_us() - model.t_start_us;
+    return true;
 }

-static bool llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
+// Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
+static int llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
     try {
         llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);

@@ -3749,19 +3760,21 @@ static bool llama_model_load(const std::string & fname, llama_model & model, con

         if (params.vocab_only) {
             LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
-            return true;
+            return 0;
         }

-        llm_load_tensors(
+        if (!llm_load_tensors(
             ml, model, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.use_mlock,
             params.progress_callback, params.progress_callback_user_data
-        );
+        )) {
+            return -2;
+        }
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
-        return false;
+        return -1;
     }

-    return true;
+    return 0;
 }

 //
@@ -9141,11 +9154,18 @@ struct llama_model * llama_load_model_from_file(
                     LLAMA_LOG_INFO("\n");
                 }
             }
+            return true;
         };
     }

-    if (!llama_model_load(path_model, *model, params)) {
-        LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
+    int status = llama_model_load(path_model, *model, params);
+    GGML_ASSERT(status <= 0);
+    if (status < 0) {
+        if (status == -1) {
+            LLAMA_LOG_ERROR("%s: failed to load model\n", __func__);
+        } else if (status == -2) {
+            LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
+        }
         delete model;
         return nullptr;
     }