mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-30 08:42:00 +00:00 
			
		
		
		
	ggml : use F16 instead of F32 in Q4_0, Q4_1, Q8_0 (#1508)
* ggml : use F16 instead of F32 in Q4_0, Q4_1 and Q8_0
* llama : bump LLAMA_FILE_VERSION to 3
* cuda : update Q4 and Q8 dequantize kernels
* ggml : fix AVX dot products
* readme : update performance table + hot topics
This commit is contained in:
		
							
								
								
									
										18
									
								
								llama.cpp
									
									
									
									
									
								
							
							
						
						
									
										18
									
								
								llama.cpp
									
									
									
									
									
								
							| @@ -406,6 +406,7 @@ enum llama_file_version { | ||||
|     LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab | ||||
|     LLAMA_FILE_VERSION_GGJT_V1, // added padding | ||||
|     LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format | ||||
|     LLAMA_FILE_VERSION_GGJT_V3, // changed Q4 and Q8 quantization format | ||||
| }; | ||||
|  | ||||
| struct llama_file_loader { | ||||
| @@ -438,6 +439,8 @@ struct llama_file_loader { | ||||
|             file_version = LLAMA_FILE_VERSION_GGJT_V1; | ||||
|         } else if (magic == 'ggjt' && version == 2) { | ||||
|             file_version = LLAMA_FILE_VERSION_GGJT_V2; | ||||
|         } else if (magic == 'ggjt' && version == 3) { | ||||
|             file_version = LLAMA_FILE_VERSION_GGJT_V3; | ||||
|         } else { | ||||
|             throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?", | ||||
|                          magic, version); | ||||
| @@ -844,7 +847,8 @@ static const char *llama_file_version_name(llama_file_version version) { | ||||
|         case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)"; | ||||
|         case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)"; | ||||
|         case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)"; | ||||
|         case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (latest)"; | ||||
|         case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (pre #1508)"; | ||||
|         case LLAMA_FILE_VERSION_GGJT_V3: return "ggjt v3 (latest)"; | ||||
|     } | ||||
|  | ||||
|     return "unknown"; | ||||
| @@ -924,11 +928,19 @@ static void llama_model_load_internal( | ||||
|         fprintf(stderr, "%s: model size = %s\n",  __func__, llama_model_type_name(model.type)); | ||||
|     } | ||||
|  | ||||
|     if (file_version != LLAMA_FILE_VERSION_GGJT_V2) { | ||||
|     if (file_version < LLAMA_FILE_VERSION_GGJT_V2) { | ||||
|         if (hparams.ftype != LLAMA_FTYPE_ALL_F32     && | ||||
|             hparams.ftype != LLAMA_FTYPE_MOSTLY_F16  && | ||||
|             hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) { | ||||
|             throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1305)"); | ||||
|             throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)"); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     if (file_version < LLAMA_FILE_VERSION_GGJT_V3) { | ||||
|         if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || | ||||
|             hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_1 || | ||||
|             hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) { | ||||
|             throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)"); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Georgi Gerganov
					Georgi Gerganov