mirror of https://github.com/ggml-org/llama.cpp.git
	llama : quantize up to 31% faster on Linux and Windows with mmap (#3206)
* llama : enable mmap in quantize on Linux -> 31% faster

* also enable mmap on Windows

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
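For context on the speedup claim, below is a hypothetical, POSIX-only micro-benchmark (not part of this commit) contrasting the two ways of streaming a large file that the diff switches between: a read() loop copies every byte through a user-space staging buffer, while mmap() lets the kernel page the file straight into the process and reuse the page cache on a warm run. The file name "model.gguf" and the 1 MiB buffer size are assumptions for illustration.

    // Hypothetical micro-benchmark: sequential scan via read() vs mmap().
    // POSIX-only sketch; not part of the commit.
    #include <cstdio>
    #include <cstdint>
    #include <vector>
    #include <chrono>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/mman.h>
    #include <sys/stat.h>

    static uint64_t sum_read(int fd, size_t size) {
        std::vector<uint8_t> buf(1 << 20);  // 1 MiB staging buffer (assumed size)
        uint64_t sum = 0;
        size_t off = 0;
        while (off < size) {
            ssize_t n = pread(fd, buf.data(), buf.size(), off);
            if (n <= 0) break;
            for (ssize_t i = 0; i < n; i++) sum += buf[i];
            off += n;
        }
        return sum;
    }

    static uint64_t sum_mmap(int fd, size_t size) {
        auto * p = (uint8_t *) mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED) return 0;
        uint64_t sum = 0;
        for (size_t i = 0; i < size; i++) sum += p[i];  // kernel pages data in on demand
        munmap(p, size);
        return sum;
    }

    int main() {
        int fd = open("model.gguf", O_RDONLY);  // any large file works
        if (fd < 0) { perror("open"); return 1; }
        struct stat st;
        fstat(fd, &st);
        const size_t size = st.st_size;

        auto t0 = std::chrono::steady_clock::now();
        uint64_t a = sum_read(fd, size);
        auto t1 = std::chrono::steady_clock::now();
        uint64_t b = sum_mmap(fd, size);
        auto t2 = std::chrono::steady_clock::now();

        printf("read(): %lld ms (sum=%llu)\n",
               (long long) std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0).count(),
               (unsigned long long) a);
        printf("mmap(): %lld ms (sum=%llu)\n",
               (long long) std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count(),
               (unsigned long long) b);
        close(fd);
        return 0;
    }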
llama.cpp | 21 +++++++++++++++++----
1 file changed, 17 insertions(+), 4 deletions(-)
@@ -6027,7 +6027,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         nthread = std::thread::hardware_concurrency();
     }
 
-    llama_model_loader ml(fname_inp, /*use_mmap*/ false);
+    // mmap consistently increases speed on Linux, and also increases speed on Windows with
+    // hot cache. It may cause a slowdown on macOS, possibly related to free memory.
+#if defined(__linux__) || defined(_WIN32)
+    constexpr bool use_mmap = true;
+#else
+    constexpr bool use_mmap = false;
+#endif
+
+    llama_model_loader ml(fname_inp, use_mmap);
+    if (ml.use_mmap) {
+        ml.mapping.reset(new llama_mmap(&ml.file, /* prefetch */ 0, ggml_is_numa()));
+    }
 
     llama_model model;
     llm_load_arch(ml, model);
@@ -6105,10 +6116,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
         const std::string name = ggml_get_name(tensor);
 
-        if (read_data.size() < ggml_nbytes(tensor)) {
-            read_data.resize(ggml_nbytes(tensor));
+        if (!ml.use_mmap) {
+            if (read_data.size() < ggml_nbytes(tensor)) {
+                read_data.resize(ggml_nbytes(tensor));
+            }
+            tensor->data = read_data.data();
         }
-        tensor->data = read_data.data();
         ml.load_data_for(tensor);
 
         LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
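A minimal sketch of the per-tensor pattern the second hunk introduces, with hypothetical stand-ins (Tensor, Mapping, load_data_for) for ggml_tensor, llama_mmap, and ml.load_data_for: with mmap enabled, the tensor's data pointer aims directly into the mapping (zero copy), otherwise a single reusable staging buffer is grown as needed and the bytes are read into it. Note the commit also passes /* prefetch */ 0 when creating the mapping, presumably because quantization streams each tensor exactly once.

    // Sketch with hypothetical types; not the actual llama.cpp API.
    #include <cstdint>
    #include <cstring>
    #include <vector>

    struct Tensor {
        size_t offset;   // position of the tensor's bytes in the model file
        size_t nbytes;   // size of those bytes
        void * data;     // where quantization will read them from
    };

    struct Mapping {
        uint8_t * addr;  // base address of the memory-mapped model file
    };

    // One staging buffer shared by all tensors, mirroring read_data above.
    static std::vector<uint8_t> read_data;

    // Hypothetical stand-in for ml.load_data_for(tensor). file_bytes plays the
    // role of the open file in the non-mmap path.
    void load_data_for(Tensor & t, bool use_mmap, const Mapping & map,
                       const uint8_t * file_bytes) {
        if (use_mmap) {
            t.data = map.addr + t.offset;    // zero-copy: no buffer, no read
        } else {
            if (read_data.size() < t.nbytes) {
                read_data.resize(t.nbytes);  // grow once, then reuse
            }
            t.data = read_data.data();
            memcpy(t.data, file_bytes + t.offset, t.nbytes);
        }
    }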
Cebtenzzre