	ggml : remove bit shuffling (#1405)
* ggml : remove Q4_0 bit shuffling (ARM NEON)
* ggml : remove Q4_1 bit shuffling (ARM NEON + reference)
* ggml : nibbles_from_floats() + bytes_from_nibbles() (ARM NEON; see the layout sketch after this message)
* ggml : remove Q4_2 bit shuffling (WIP, BROKEN)
* ggml : remove Q5_0 bit shuffling (ARM NEON)
* ggml : 2x faster scalar implementations
* ggml : remove Q5_1 bit shuffling (ARM NEON + scalar)
* ggml : simplify scalar dot
* ggml : remove WASM SIMD bit shuffling + remove vzip for ARM 32-bit
* ggml : fix Q4_1 quantization
* ggml : update cuBLAS + normalize variable names
* ggml : remove Q4_2 mode
* ggml : minor formatting
* ggml : fix Q5_0 quantization
* scripts : add script for measuring the time per token
* AVX implementations (#1370)
* ggml : uniform 5th bit extraction (see the Q5 sketch after the diff below)
* llama : produce error upon loading old model files
* llama : fix model magic/version write
* ggml : speed-up Q5_0 + Q5_1 at 4 threads
* ggml : preserve old Q4 and Q5 formats
* ggml : simplify Q8_1 - no need for low / high sums anymore
* ggml : fix Q8_0 and Q8_1 rounding
* Revert "AVX implementations (#1370)"
This reverts commit 948d124837.
* ggml : fix AVX2 implementation
* sha : update hashes for 7B and 13B
* readme : update timings + remove warning banner
* llama : update v2 PR number to 1405
* ggml : fix WASM comments
* ggml : back to original bit order
* readme : add note that Q4 and Q5 have been changed
* llama : fix return for unknown version
---------
Co-authored-by: Stephan Walter <stephan@walter.name>
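For context, the "bit shuffling" being removed here is the interleaved nibble order that the Q4/Q5 kernels used to require. Below is a minimal sketch of the flat layout this PR moves to; pack_nibbles/unpack_nibbles are hypothetical names for illustration only, standing in for ggml's actual nibbles_from_floats()/bytes_from_nibbles() helpers:

    #include <stdint.h>
    #include <stddef.h>

    // Flat nibble layout (illustrative, not the actual ggml code): byte i of
    // a block stores quant x[i] in its low nibble and quant x[i + n] in its
    // high nibble, where n is half the block size.
    static void pack_nibbles(const uint8_t * x, uint8_t * qs, size_t n) {
        for (size_t i = 0; i < n; ++i) {
            qs[i] = (x[i] & 0x0F) | ((x[i + n] & 0x0F) << 4);
        }
    }

    static void unpack_nibbles(const uint8_t * qs, uint8_t * x, size_t n) {
        for (size_t i = 0; i < n; ++i) {
            x[i]     = qs[i] & 0x0F; // first half of the block
            x[i + n] = qs[i] >> 4;   // second half of the block
        }
    }

With this order a SIMD kernel can split an entire register into low and high nibbles with one AND and one shift; the old interleaved order forced the ARM NEON paths to re-zip the lanes first, which is roughly what the "remove vzip" and "2x faster scalar implementations" items above refer to.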
			
			
llama.cpp | 29 +++++++++++++++++++----------
1 file changed, 19 insertions(+), 10 deletions(-)
@@ -402,6 +402,7 @@ enum llama_file_version {
     LLAMA_FILE_VERSION_GGML,
     LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
     LLAMA_FILE_VERSION_GGJT_V1, // added padding
+    LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format
 };
 
 struct llama_file_loader {
@@ -432,6 +433,8 @@ struct llama_file_loader {
             file_version = LLAMA_FILE_VERSION_GGMF_V1;
         } else if (magic == 'ggjt' && version == 1) {
             file_version = LLAMA_FILE_VERSION_GGJT_V1;
+        } else if (magic == 'ggjt' && version == 2) {
+            file_version = LLAMA_FILE_VERSION_GGJT_V2;
         } else {
             throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
                          magic, version);
@@ -482,7 +485,6 @@ struct llama_file_loader {
                 case GGML_TYPE_F16:
                 case GGML_TYPE_Q4_0:
                 case GGML_TYPE_Q4_1:
-                case GGML_TYPE_Q4_2:
                 case GGML_TYPE_Q5_0:
                 case GGML_TYPE_Q5_1:
                 case GGML_TYPE_Q8_0:
@@ -527,8 +529,8 @@ struct llama_file_saver {
         write_vocab();
     }
     void write_magic() {
-        file.write_u32('ggjt'); // magic
-        file.write_u32(1); // version
+        file.write_u32(LLAMA_FILE_MAGIC);   // magic
+        file.write_u32(LLAMA_FILE_VERSION); // version
     }
     void write_hparams(enum llama_ftype new_ftype) {
         const llama_hparams & hparams = any_file_loader->hparams;
@@ -558,7 +560,6 @@ struct llama_file_saver {
             case GGML_TYPE_F16:
             case GGML_TYPE_Q4_0:
             case GGML_TYPE_Q4_1:
-            case GGML_TYPE_Q4_2:
             case GGML_TYPE_Q5_0:
             case GGML_TYPE_Q5_1:
             case GGML_TYPE_Q8_0:
@@ -839,9 +840,11 @@ static const char *llama_file_version_name(llama_file_version version) {
     switch (version) {
         case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
         case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
-        case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (latest)";
-        default: LLAMA_ASSERT(false);
+        case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)";
+        case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (latest)";
     }
+
+    return "unknown";
 }
 
 static const char *llama_ftype_name(enum llama_ftype ftype) {
@@ -852,7 +855,6 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
         case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
                                       return "mostly Q4_1, some F16";
-        case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
         case LLAMA_FTYPE_MOSTLY_Q5_0: return "mostly Q5_0";
         case LLAMA_FTYPE_MOSTLY_Q5_1: return "mostly Q5_1";
         case LLAMA_FTYPE_MOSTLY_Q8_0: return "mostly Q8_0";
@@ -918,6 +920,14 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: model size = %s\n",  __func__, llama_model_type_name(model.type));
     }
 
+    if (file_version != LLAMA_FILE_VERSION_GGJT_V2) {
+        if (hparams.ftype != LLAMA_FTYPE_ALL_F32     &&
+            hparams.ftype != LLAMA_FTYPE_MOSTLY_F16  &&
+            hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
+            throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)");
+        }
+    }
+
     if (vocab_only) {
         return;
     }
@@ -1905,7 +1915,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     switch (ftype) {
         case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
         case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
-        case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
         case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break;
         case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break;
         case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break;
@@ -2813,9 +2822,9 @@ void llama_print_timings(struct llama_context * ctx) {
 
     fprintf(stderr, "\n");
     fprintf(stderr, "%s:        load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
-    fprintf(stderr, "%s:      sample time = %8.2f ms / %5d runs   (%8.2f ms per run)\n",   __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample);
+    fprintf(stderr, "%s:      sample time = %8.2f ms / %5d runs   (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample);
     fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval);
-    fprintf(stderr, "%s:        eval time = %8.2f ms / %5d runs   (%8.2f ms per run)\n",   __func__, 1e-3 * ctx->t_eval_us,   n_eval,   1e-3 * ctx->t_eval_us   / n_eval);
+    fprintf(stderr, "%s:        eval time = %8.2f ms / %5d runs   (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_eval_us,   n_eval,   1e-3 * ctx->t_eval_us   / n_eval);
     fprintf(stderr, "%s:       total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
 }
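The Q5 formats add one bit per weight on top of the same flat layout: the "uniform 5th bit extraction" item in the commit message refers to keeping bit 4 of all 32 weights in a single 32-bit mask and OR-ing it onto the low nibble with the same expression for every lane. A minimal sketch under assumed, simplified types (block_q5_sketch/unpack_q5 are illustrative names; the real block_q5_0 also carries an fp16 scale that is applied after unpacking):

    #include <stdint.h>
    #include <string.h>

    #define QK5 32

    // Illustrative Q5_0-style block: 16 bytes of packed low nibbles plus a
    // 32-bit mask holding the fifth bit of each of the 32 weights.
    typedef struct {
        uint8_t qh[4];       // fifth bits, one per weight
        uint8_t qs[QK5 / 2]; // low four bits, two weights per byte
    } block_q5_sketch;

    // Rebuild the 5-bit quants (0..31); the fifth bit is the same
    // ((qh >> i) & 1) << 4 expression for every lane, hence "uniform".
    static void unpack_q5(const block_q5_sketch * b, uint8_t * q) {
        uint32_t qh;
        memcpy(&qh, b->qh, sizeof(qh)); // safe unaligned load of the bit mask

        for (int i = 0; i < QK5 / 2; ++i) {
            const uint8_t h0 = ((qh >> i)             & 1) << 4;
            const uint8_t h1 = ((qh >> (i + QK5 / 2)) & 1) << 4;

            q[i]           = (b->qs[i] & 0x0F) | h0; // first half of the block
            q[i + QK5 / 2] = (b->qs[i] >>   4) | h1; // second half of the block
        }
    }

Files quantized with the previous bit order are exactly what the new GGJT_V2 check in llama_model_load_internal above rejects; F32, F16 and Q8_0 tensors stay loadable from v1 files, presumably because those encodings were not affected by the shuffle removal.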