Mirror of https://github.com/ggml-org/llama.cpp.git
	llama : quantize up to 31% faster on Linux and Windows with mmap (#3206)
* llama : enable mmap in quantize on Linux -> 31% faster

* also enable mmap on Windows

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
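Why mmap helps here: without it, quantize stages every tensor through a heap buffer, paying one read and one copy per tensor; with it, the tensor pointer aliases the page cache and no copy is made. Below is a minimal sketch of the two load paths, assuming a POSIX system; the names (load_buffered, load_mapped, model.gguf) are illustrative, not llama.cpp API. In the real code, llama_mmap hides the platform split (mmap on POSIX, CreateFileMapping/MapViewOfFile on Windows).

    // Minimal sketch (POSIX only) contrasting the two load paths this patch
    // switches between. All names are illustrative; llama.cpp wraps the
    // platform differences in llama_mmap.
    #include <cstddef>
    #include <cstdio>
    #include <vector>
    #include <fcntl.h>
    #include <sys/mman.h>
    #include <sys/stat.h>
    #include <unistd.h>

    // Buffered path: the tensor is copied from the file into a heap buffer.
    static std::vector<char> load_buffered(int fd, off_t offs, size_t size) {
        std::vector<char> buf(size);
        if (pread(fd, buf.data(), size, offs) != (ssize_t) size) { // one copy per tensor
            perror("pread");
        }
        return buf;
    }

    // Mapped path: the pointer aliases the page cache; no copy is made and
    // pages are faulted in lazily by the kernel.
    static const char * load_mapped(const char * base, off_t offs) {
        return base + offs;
    }

    int main() {
        const int fd = open("model.gguf", O_RDONLY); // hypothetical input file
        if (fd < 0) { perror("open"); return 1; }

        struct stat st;
        if (fstat(fd, &st) != 0) { perror("fstat"); return 1; }

        void * base = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
        if (base == MAP_FAILED) { perror("mmap"); return 1; }

        // e.g. a "tensor" of 4096 bytes at offset 0:
        std::vector<char> copy = load_buffered(fd, 0, 4096);
        const char * view      = load_mapped((const char *) base, 0);
        printf("buffered copy at %p, mapped view at %p\n",
               (void *) copy.data(), (void *) view);

        munmap(base, st.st_size);
        close(fd);
        return 0;
    }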
1 changed file with 15 changed lines: llama.cpp
@@ -6027,7 +6027,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         nthread = std::thread::hardware_concurrency();
     }
 
-    llama_model_loader ml(fname_inp, /*use_mmap*/ false);
+    // mmap consistently increases speed on Linux, and also increases speed on Windows with
+    // hot cache. It may cause a slowdown on macOS, possibly related to free memory.
+#if defined(__linux__) || defined(_WIN32)
+    constexpr bool use_mmap = true;
+#else
+    constexpr bool use_mmap = false;
+#endif
+
+    llama_model_loader ml(fname_inp, use_mmap);
+    if (ml.use_mmap) {
+        ml.mapping.reset(new llama_mmap(&ml.file, /* prefetch */ 0, ggml_is_numa()));
+    }
 
     llama_model model;
     llm_load_arch(ml, model);
@@ -6105,10 +6116,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
         const std::string name = ggml_get_name(tensor);
 
+        if (!ml.use_mmap) {
             if (read_data.size() < ggml_nbytes(tensor)) {
                 read_data.resize(ggml_nbytes(tensor));
             }
             tensor->data = read_data.data();
+        }
         ml.load_data_for(tensor);
 
         LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
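The guard added in the second hunk is sufficient because load_data_for already branches on use_mmap: on the mapped path it repoints tensor->data into the mapping instead of reading into the caller's buffer, so the read_data staging buffer is only needed on the buffered path. A simplified, self-contained sketch of that shape (stand-in types; paraphrased rather than quoted from llama.cpp):

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    // Stand-ins for the real llama.cpp types, to make the control flow concrete.
    struct mapping_t { uint8_t * addr; };                          // cf. llama_mmap
    struct tensor_t  { void * data; size_t offs; size_t nbytes; }; // cf. ggml_tensor + its file offset

    // Paraphrase of llama_model_loader::load_data_for: the mmap path aliases
    // the mapping (zero-copy), while the buffered path reads into whatever
    // tensor->data already points at, which the quantize loop set to read_data.
    static void load_data_for(tensor_t & t, bool use_mmap, const mapping_t & map, FILE * f) {
        if (use_mmap) {
            t.data = map.addr + t.offs;              // pages fault in on first touch
        } else {
            fseek(f, (long) t.offs, SEEK_SET);
            if (fread(t.data, 1, t.nbytes, f) != t.nbytes) {
                fprintf(stderr, "short read\n");
            }
        }
    }

Constructing the mapping with /* prefetch */ 0 skips llama_mmap's optional readahead hint, so pages fault in lazily as the quantize loop visits each tensor once.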
Author: Cebtenzzre