mirror of https://github.com/ggml-org/llama.cpp.git
	llama : fix integer overflow during quantization (#4284)
Happens with multi-threaded quantization of Qwen-72B.

ggml-ci
 llama.cpp | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)
@@ -7655,18 +7655,21 @@ static void llama_convert_tensor_internal(
         return;
     }
 
-    auto block_size = tensor->type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor->type);
-    auto block_size_bytes = ggml_type_size(tensor->type);
+    size_t block_size = tensor->type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor->type);
+    size_t block_size_bytes = ggml_type_size(tensor->type);
 
     GGML_ASSERT(nelements % block_size == 0);
-    auto nblocks = nelements / block_size;
-    auto blocks_per_thread = nblocks / nthread;
-    auto spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
+    size_t nblocks = nelements / block_size;
+    size_t blocks_per_thread = nblocks / nthread;
+    size_t spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
 
-    for (auto tnum = 0, in_buff_offs = 0, out_buff_offs = 0; tnum < nthread; tnum++) {
-        auto thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
-        auto thr_elems = thr_blocks * block_size; // number of elements for this thread
-        auto thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread
+    size_t in_buff_offs = 0;
+    size_t out_buff_offs = 0;
+
+    for (int tnum = 0; tnum < nthread; tnum++) {
+        size_t thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
+        size_t thr_elems = thr_blocks * block_size; // number of elements for this thread
+        size_t thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread
 
         auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
             if (typ == GGML_TYPE_F16) {
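Why the auto -> size_t change matters: in the old code the running buffer offsets were declared inside the for statement as "for (auto tnum = 0, in_buff_offs = 0, out_buff_offs = 0; ...)", so every name in that declaration was deduced as int. Once a tensor's data spans more bytes than INT_MAX (plausible for the largest tensors of a 72B model), accumulating per-thread byte offsets into an int overflows. The standalone sketch below is not llama.cpp code; the block count and block size are made-up illustrative values chosen only so the total exceeds INT_MAX.

// Standalone sketch: why an int-typed buffer offset cannot hold the
// accumulated per-thread byte offsets of a very large tensor.
// nblocks and block_size_bytes are hypothetical, not Qwen-72B's real values.
#include <climits>
#include <cstdio>

int main() {
    const size_t nblocks          = 300000000;  // hypothetical number of quantization blocks
    const size_t block_size_bytes = 18;         // hypothetical bytes per block
    const int    nthread          = 8;

    size_t blocks_per_thread = nblocks / nthread;

    long long total_offset = 0;  // wide accumulator so the demo itself cannot overflow

    // Mirrors the per-thread bookkeeping: each thread's chunk advances the input offset.
    for (int tnum = 0; tnum < nthread; tnum++) {
        size_t thr_blocks      = blocks_per_thread;
        size_t thr_block_bytes = thr_blocks * block_size_bytes;
        total_offset          += (long long) thr_block_bytes;
    }

    // With "for (auto tnum = 0, in_buff_offs = 0, ...)" the offset is deduced as int,
    // which cannot represent this value; accumulating it there is signed overflow (UB).
    printf("final input offset: %lld bytes (INT_MAX = %d)\n", total_offset, INT_MAX);
    printf("fits in int? %s\n", total_offset <= INT_MAX ? "yes" : "no");
    return 0;
}

The committed fix hoists in_buff_offs and out_buff_offs out of the for statement and declares them, together with the other counters, as size_t, so the offset arithmetic stays in 64-bit unsigned range on typical 64-bit platforms.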