Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-30 08:42:00 +00:00)
@@ -22,7 +22,7 @@ static void zeros(std::ofstream & file, size_t n) {
     }
 }
 
-struct quantize_state_internal {
+struct quantize_state_impl {
     const llama_model                 & model;
     const llama_model_quantize_params * params;
 
@@ -43,13 +43,13 @@ struct quantize_state_internal {
     // used to figure out if a model shares tok_embd with the output weight
     bool has_output = false;
 
-    quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
+    quantize_state_impl(const llama_model & model, const llama_model_quantize_params * params)
         : model(model)
         , params(params)
         {}
 };
 
-static void llama_tensor_dequantize_internal(
+static void llama_tensor_dequantize_impl(
     struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
     const size_t nelements, const int nthread
 ) {
@@ -121,7 +121,7 @@ static void llama_tensor_dequantize_internal(
     workers.clear();
 }
 
-static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
+static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype) {
     const std::string name = ggml_get_name(tensor);
 
     // TODO: avoid hardcoded tensor names - use the TN_* constants
@@ -410,7 +410,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
     return new_type;
 }
 
-static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
+static size_t llama_tensor_quantize_impl(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector<std::thread> & workers, const int nthread) {
     if (nthread < 2) {
         // single-thread
         size_t new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix);
@@ -464,7 +464,7 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
     return new_size;
 }
 
-static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
+static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
     ggml_type default_type;
     llama_ftype ftype = params->ftype;
 
@@ -534,7 +534,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     llm_load_hparams(ml, model);
     llm_load_stats  (ml, model);
 
-    struct quantize_state_internal qs(model, params);
+    struct quantize_state_impl qs(model, params);
 
     if (params->only_copy) {
         ftype = model.ftype;
@@ -837,7 +837,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
                 throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
             } else {
-                llama_tensor_dequantize_internal(tensor, f32_conv_buf, workers, nelements, nthread);
+                llama_tensor_dequantize_impl(tensor, f32_conv_buf, workers, nelements, nthread);
                 f32_data = (float *) f32_conv_buf.data();
             }
 
@@ -866,7 +866,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
                 const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
 
-                new_size += llama_tensor_quantize_internal(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
+                new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
             }
             LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
         }
@@ -919,7 +919,7 @@ uint32_t llama_model_quantize(
         const char * fname_out,
         const llama_model_quantize_params * params) {
     try {
-        llama_model_quantize_internal(fname_inp, fname_out, params);
+        llama_model_quantize_impl(fname_inp, fname_out, params);
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: failed to quantize: %s\n", __func__, err.what());
         return 1;
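For reference, the rename does not touch the public entry point: as the last hunk above shows, llama_model_quantize() still wraps the (now) llama_model_quantize_impl() in a try/catch, logs the error, and returns a non-zero status on failure. A minimal, hypothetical caller sketch (file names and target quantization type are placeholders, not taken from this change):

```cpp
#include "llama.h"

int main() {
    // start from the library defaults, then override what we need
    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype   = LLAMA_FTYPE_MOSTLY_Q4_K_M; // assumed target type, for illustration only
    params.nthread = 8;

    // returns 0 on success; on failure the impl's exception is caught,
    // logged via LLAMA_LOG_ERROR and surfaced here as a non-zero status
    if (llama_model_quantize("model-f16.gguf", "model-q4_k_m.gguf", &params) != 0) {
        return 1;
    }
    return 0;
}
```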
@@ -10717,7 +10717,7 @@ static enum ggml_status llama_graph_compute(
 // return positive int on warning
 // return negative int on error
 //
-static int llama_decode_internal(
+static int llama_decode_impl(
          llama_context & lctx,
            llama_batch   inp_batch) {
 
@@ -11052,7 +11052,7 @@ static int llama_decode_internal(
 // return positive int on warning
 // return negative int on error
 //
-static int llama_encode_internal(
+static int llama_encode_impl(
          llama_context & lctx,
            llama_batch   inp_batch) {
 
@@ -11234,7 +11234,7 @@ static int llama_encode_internal(
 }
 
 // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
-static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
+static void llama_kv_cache_defrag_impl(struct llama_context & lctx) {
     auto & kv_self = lctx.kv_self;
 
     const auto & hparams = lctx.model.hparams;
@@ -11454,7 +11454,7 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
     //LLAMA_LOG_INFO("(tmp log) KV defrag time: %.3f ms\n", (t_end - t_start)/1000.0);
 }
 
-static void llama_kv_cache_update_internal(struct llama_context & lctx) {
+static void llama_kv_cache_update_impl(struct llama_context & lctx) {
     bool need_reserve = false;
 
     if (lctx.kv_self.has_shift) {
@@ -11490,7 +11490,7 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
 
     // defragment the KV cache if needed
     if (lctx.kv_self.do_defrag) {
-        llama_kv_cache_defrag_internal(lctx);
+        llama_kv_cache_defrag_impl(lctx);
 
         need_reserve = true;
 
@@ -12191,7 +12191,7 @@ void llama_kv_cache_defrag(struct llama_context * ctx) {
 }
 
 void llama_kv_cache_update(struct llama_context * ctx) {
-    llama_kv_cache_update_internal(*ctx);
+    llama_kv_cache_update_impl(*ctx);
 }
 
 bool llama_kv_cache_can_shift(struct llama_context * ctx) {
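For context: the hunks above suggest that llama_kv_cache_defrag() only schedules the defrag (via kv_self.do_defrag), while the data movement runs inside llama_kv_cache_update(), now routed through llama_kv_cache_update_impl(). A short, hypothetical usage sketch (ctx is assumed to be a valid llama_context):

```cpp
// Sketch only: `ctx` is assumed to be a valid llama_context.
// llama_kv_cache_defrag() schedules the defrag; the actual work happens on the
// next cache update (explicitly below, or implicitly during decode).
llama_kv_cache_defrag(ctx);
llama_kv_cache_update(ctx); // -> llama_kv_cache_update_impl -> llama_kv_cache_defrag_impl
```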
@@ -12203,7 +12203,7 @@ bool llama_kv_cache_can_shift(struct llama_context * ctx) {
 int32_t llama_encode(
         struct llama_context * ctx,
           struct llama_batch   batch) {
-    const int ret = llama_encode_internal(*ctx, batch);
+    const int ret = llama_encode_impl(*ctx, batch);
     if (ret != 0) {
         LLAMA_LOG_ERROR("%s: failed to encode, ret = %d\n", __func__, ret);
     }
@@ -12214,7 +12214,7 @@ int32_t llama_encode(
 int32_t llama_decode(
         struct llama_context * ctx,
           struct llama_batch   batch) {
-    const int ret = llama_decode_internal(*ctx, batch);
+    const int ret = llama_decode_impl(*ctx, batch);
     if (ret != 0) {
         LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
     }
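Similarly, llama_encode()/llama_decode() keep their return convention (negative on error, positive on warning, 0 on success, per the comments in the hunks above) while delegating to the renamed *_impl functions. A hedged sketch of a call site, assuming `ctx` and `batch` were set up elsewhere:

```cpp
#include <cstdio>
#include "llama.h"

// Sketch only: `ctx` must be a valid llama_context and `batch` a populated
// llama_batch (e.g. from llama_batch_get_one()).
static bool decode_one(llama_context * ctx, llama_batch batch) {
    const int ret = llama_decode(ctx, batch);
    if (ret < 0) {
        fprintf(stderr, "llama_decode failed: %d\n", ret);  // hard error
        return false;
    }
    if (ret > 0) {
        fprintf(stderr, "llama_decode warning: %d\n", ret); // e.g. no free KV cache slot for this batch
    }
    return true;
}
```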