	imatrix : migrate to gpt_params (#7771)
* imatrix : migrate to gpt_params

ggml-ci

* imatrix : add --save-frequency cli arg
* common : fix --no-ppl
		| @@ -273,6 +273,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { | ||||
|         } | ||||
|     } catch (const std::invalid_argument & ex) { | ||||
|         fprintf(stderr, "%s\n", ex.what()); | ||||
|         params = params_org; | ||||
|         return false; | ||||
|     } | ||||
|  | ||||
| @@ -408,6 +409,20 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa | ||||
|         } | ||||
|         return true; | ||||
|     } | ||||
|     if (arg == "--in-file") { | ||||
|         if (++i >= argc) { | ||||
|             invalid_param = true; | ||||
|             return true; | ||||
|         } | ||||
|         std::ifstream file(argv[i]); | ||||
|         if (!file) { | ||||
|             fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); | ||||
|             invalid_param = true; | ||||
|             return true; | ||||
|         } | ||||
|         params.in_files.push_back(argv[i]); | ||||
|         return true; | ||||
|     } | ||||
|     if (arg == "-n" || arg == "--predict" || arg == "--n-predict") { | ||||
|         if (++i >= argc) { | ||||
|             invalid_param = true; | ||||
| @@ -1081,7 +1096,15 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa | ||||
|         return true; | ||||
|     } | ||||
|     if (arg == "-v" || arg == "--verbose") { | ||||
|         params.verbose = true; | ||||
|         params.verbosity = 1; | ||||
|         return true; | ||||
|     } | ||||
|     if (arg == "--verbosity") { | ||||
|         if (++i >= argc) { | ||||
|             invalid_param = true; | ||||
|             return true; | ||||
|         } | ||||
|         params.verbosity = std::stoi(argv[i]); | ||||
|         return true; | ||||
|     } | ||||
|     if (arg == "--verbose-prompt") { | ||||
| @@ -1537,6 +1560,46 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa | ||||
|         params.i_pos = std::stoi(argv[i]); | ||||
|         return true; | ||||
|     } | ||||
|     if (arg == "-o" || arg == "--output" || arg == "--output-file") { | ||||
|         if (++i >= argc) { | ||||
|             invalid_param = true; | ||||
|             return true; | ||||
|         } | ||||
|         params.out_file = argv[i]; | ||||
|         return true; | ||||
|     } | ||||
|     if (arg == "-ofreq" || arg == "--output-frequency") { | ||||
|         if (++i >= argc) { | ||||
|             invalid_param = true; | ||||
|             return true; | ||||
|         } | ||||
|         params.n_out_freq = std::stoi(argv[i]); | ||||
|         return true; | ||||
|     } | ||||
|     if (arg == "--save-frequency") { | ||||
|         if (++i >= argc) { | ||||
|             invalid_param = true; | ||||
|             return true; | ||||
|         } | ||||
|         params.n_save_freq = std::stoi(argv[i]); | ||||
|         return true; | ||||
|     } | ||||
|     if (arg == "--process-output") { | ||||
|         params.process_output = true; | ||||
|         return true; | ||||
|     } | ||||
|     if (arg == "--no-ppl") { | ||||
|         params.compute_ppl = false; | ||||
|         return true; | ||||
|     } | ||||
|     if (arg == "--chunk" || arg == "--from-chunk") { | ||||
|         if (++i >= argc) { | ||||
|             invalid_param = true; | ||||
|             return true; | ||||
|         } | ||||
|         params.i_chunk = std::stoi(argv[i]); | ||||
|         return true; | ||||
|     } | ||||
| #ifndef LOG_DISABLE_LOGS | ||||
|     // Parse args for logging parameters | ||||
|     if (log_param_single_parse(argv[i])) { | ||||
| @@ -1612,6 +1675,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param | ||||
|     options.push_back({ "*",           "-h,    --help, --usage",        "print usage and exit" }); | ||||
|     options.push_back({ "*",           "       --version",              "show version and build info" }); | ||||
|     options.push_back({ "*",           "-v,    --verbose",              "print verbose information" }); | ||||
|     options.push_back({ "*",           "       --verbosity N",          "set specific verbosity level (default: %d)", params.verbosity }); | ||||
|     options.push_back({ "*",           "       --verbose-prompt",       "print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false" }); | ||||
|     options.push_back({ "*",           "       --no-display-prompt",    "don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false" }); | ||||
|     options.push_back({ "*",           "-co,   --color",                "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false" }); | ||||
| @@ -1637,6 +1701,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param | ||||
|     options.push_back({ "*",           "-fa,   --flash-attn",           "enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled" }); | ||||
|     options.push_back({ "*",           "-p,    --prompt PROMPT",        "prompt to start generation with (default: '%s')", params.prompt.c_str() }); | ||||
|     options.push_back({ "*",           "-f,    --file FNAME",           "a file containing the prompt (default: none)" }); | ||||
|     options.push_back({ "*",           "       --in-file FNAME",        "an input file (repeat to specify multiple files)" }); | ||||
|     options.push_back({ "*",           "-bf,   --binary-file FNAME",    "binary file containing the prompt (default: none)" }); | ||||
|     options.push_back({ "*",           "-e,    --escape",               "process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false" }); | ||||
|     options.push_back({ "*",           "       --no-escape",            "do not process escape sequences" }); | ||||
| @@ -1804,6 +1869,14 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param | ||||
|     options.push_back({ "passkey",     "       --junk N",               "number of times to repeat the junk text (default: %d)", params.n_junk }); | ||||
|     options.push_back({ "passkey",     "       --pos N",                "position of the passkey in the junk text (default: %d)", params.i_pos }); | ||||
|  | ||||
|     options.push_back({ "imatrix" }); | ||||
|     options.push_back({ "imatrix",     "-o,    --output FNAME",         "output file (default: '%s')", params.out_file.c_str() }); | ||||
|     options.push_back({ "imatrix",     "       --output-frequency N",   "output the imatrix every N iterations (default: %d)", params.n_out_freq }); | ||||
|     options.push_back({ "imatrix",     "       --save-frequency N",     "save an imatrix copy every N iterations (default: %d)", params.n_save_freq }); | ||||
|     options.push_back({ "imatrix",     "       --process-output",       "collect data for the output tensor (default: %s)", params.process_output ? "true" : "false" }); | ||||
|     options.push_back({ "imatrix",     "       --no-ppl",               "do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false" }); | ||||
|     options.push_back({ "imatrix",     "       --chunk N",              "start processing the input from chunk N (default: %d)", params.i_chunk }); | ||||
|  | ||||
|     options.push_back({ "bench" }); | ||||
|     options.push_back({ "bench",       "-pps",                          "is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false" }); | ||||
|     options.push_back({ "bench",       "-npp n0,n1,...",                "number of prompt tokens" }); | ||||
|   | ||||
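For orientation, here is a minimal sketch of how a tool built on the shared `common` layer picks up the options added above. It is not part of the commit; it only uses functions and fields that appear in these hunks (`gpt_params_parse`, `gpt_params_print_usage`, `in_files`, `out_file`, `n_out_freq`, `n_save_freq`, `compute_ppl`, `verbosity`) and assumes `common.h` from this change is on the include path.

```cpp
// Sketch only: dump the imatrix-related gpt_params fields after parsing.
#include "common.h"

#include <cstdio>

int main(int argc, char ** argv) {
    gpt_params params;

    if (!gpt_params_parse(argc, argv, params)) {
        gpt_params_print_usage(argc, argv, params);
        return 1;
    }

    // the imatrix-specific options are now plain fields on gpt_params
    fprintf(stderr, "output file      : %s\n", params.out_file.c_str());
    fprintf(stderr, "output frequency : %d\n", params.n_out_freq);
    fprintf(stderr, "save frequency   : %d\n", params.n_save_freq);
    fprintf(stderr, "compute ppl      : %s\n", params.compute_ppl ? "true" : "false");
    fprintf(stderr, "verbosity        : %d\n", params.verbosity);

    for (const auto & in_file : params.in_files) {
        fprintf(stderr, "input file       : %s\n", in_file.c_str());
    }

    return 0;
}
```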
| @@ -56,43 +56,42 @@ struct gpt_params { | ||||
|     uint32_t seed                 = LLAMA_DEFAULT_SEED; // RNG seed | ||||
|  | ||||
|     int32_t n_threads             = cpu_get_num_math(); | ||||
|     int32_t n_threads_draft       = -1; | ||||
|     int32_t n_threads_batch       = -1;    // number of threads to use for batch processing (-1 = use n_threads) | ||||
|     int32_t n_threads_batch_draft = -1; | ||||
|     int32_t n_predict             = -1;    // new tokens to predict | ||||
|     int32_t n_ctx                 = 0;     // context size | ||||
|     int32_t n_batch               = 2048;  // logical batch size for prompt processing (must be >=32 to use BLAS) | ||||
|     int32_t n_ubatch              = 512;   // physical batch size for prompt processing (must be >=32 to use BLAS) | ||||
|     int32_t n_keep                = 0;     // number of tokens to keep from initial prompt | ||||
|     int32_t n_draft               = 5;     // number of tokens to draft during speculative decoding | ||||
|     int32_t n_chunks              = -1;    // max number of chunks to process (-1 = unlimited) | ||||
|     int32_t n_parallel            = 1;     // number of parallel sequences to decode | ||||
|     int32_t n_sequences           = 1;     // number of sequences to decode | ||||
|     float   p_split               = 0.1f;  // speculative decoding split probability | ||||
|     int32_t n_gpu_layers          = -1;    // number of layers to store in VRAM (-1 - use default) | ||||
|     int32_t n_gpu_layers_draft    = -1;    // number of layers to store in VRAM for the draft model (-1 - use default) | ||||
|     llama_split_mode split_mode   = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs | ||||
|     int32_t main_gpu              = 0;     // the GPU that is used for scratch and small tensors | ||||
|     float   tensor_split[128]     = {0};   // how split tensors should be distributed across GPUs | ||||
|     int32_t n_beams               = 0;     // if non-zero then use beam search of given width. | ||||
|     int32_t grp_attn_n            = 1;     // group-attention factor | ||||
|     int32_t grp_attn_w            = 512;   // group-attention width | ||||
|     int32_t n_print               = -1;    // print token count every n tokens (-1 = disabled) | ||||
|     float   rope_freq_base        = 0.0f;  // RoPE base frequency | ||||
|     float   rope_freq_scale       = 0.0f;  // RoPE frequency scaling factor | ||||
|     int32_t n_threads_draft       =    -1; | ||||
|     int32_t n_threads_batch       =    -1; // number of threads to use for batch processing (-1 = use n_threads) | ||||
|     int32_t n_threads_batch_draft =    -1; | ||||
|     int32_t n_predict             =    -1; // new tokens to predict | ||||
|     int32_t n_ctx                 =     0; // context size | ||||
|     int32_t n_batch               =  2048; // logical batch size for prompt processing (must be >=32 to use BLAS) | ||||
|     int32_t n_ubatch              =   512; // physical batch size for prompt processing (must be >=32 to use BLAS) | ||||
|     int32_t n_keep                =     0; // number of tokens to keep from initial prompt | ||||
|     int32_t n_draft               =     5; // number of tokens to draft during speculative decoding | ||||
|     int32_t n_chunks              =    -1; // max number of chunks to process (-1 = unlimited) | ||||
|     int32_t n_parallel            =     1; // number of parallel sequences to decode | ||||
|     int32_t n_sequences           =     1; // number of sequences to decode | ||||
|     float   p_split               =  0.1f; // speculative decoding split probability | ||||
|     int32_t n_gpu_layers          =    -1; // number of layers to store in VRAM (-1 - use default) | ||||
|     int32_t n_gpu_layers_draft    =    -1; // number of layers to store in VRAM for the draft model (-1 - use default) | ||||
|     int32_t main_gpu              =     0; // the GPU that is used for scratch and small tensors | ||||
|     float   tensor_split[128]     =   {0}; // how split tensors should be distributed across GPUs | ||||
|     int32_t n_beams               =     0; // if non-zero then use beam search of given width. | ||||
|     int32_t grp_attn_n            =     1; // group-attention factor | ||||
|     int32_t grp_attn_w            =   512; // group-attention width | ||||
|     int32_t n_print               =    -1; // print token count every n tokens (-1 = disabled) | ||||
|     float   rope_freq_base        =  0.0f; // RoPE base frequency | ||||
|     float   rope_freq_scale       =  0.0f; // RoPE frequency scaling factor | ||||
|     float   yarn_ext_factor       = -1.0f; // YaRN extrapolation mix factor | ||||
|     float   yarn_attn_factor      = 1.0f;  // YaRN magnitude scaling factor | ||||
|     float   yarn_attn_factor      =  1.0f; // YaRN magnitude scaling factor | ||||
|     float   yarn_beta_fast        = 32.0f; // YaRN low correction dim | ||||
|     float   yarn_beta_slow        = 1.0f;  // YaRN high correction dim | ||||
|     int32_t yarn_orig_ctx         = 0;     // YaRN original context length | ||||
|     float   yarn_beta_slow        =  1.0f; // YaRN high correction dim | ||||
|     int32_t yarn_orig_ctx         =     0; // YaRN original context length | ||||
|     float   defrag_thold          = -1.0f; // KV cache defragmentation threshold | ||||
|     std::string rpc_servers       = "";    // comma separated list of RPC servers | ||||
|  | ||||
|     ggml_backend_sched_eval_callback cb_eval = nullptr; | ||||
|     void * cb_eval_user_data                 = nullptr; | ||||
|  | ||||
|     ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED; | ||||
|  | ||||
|     enum llama_split_mode        split_mode        = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs | ||||
|     enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED; | ||||
|     enum llama_pooling_type      pooling_type      = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings | ||||
|  | ||||
| @@ -114,7 +113,9 @@ struct gpt_params { | ||||
|     std::string lookup_cache_static  = ""; // path of static ngram cache file for lookup decoding | ||||
|     std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding | ||||
|     std::string logits_file          = ""; // file for saving *all* logits | ||||
|     std::string rpc_servers          = ""; // comma separated list of RPC servers | ||||
|  | ||||
|     std::vector<std::string> in_files;   // all input files | ||||
|     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts) | ||||
|     std::vector<llama_model_kv_override> kv_overrides; | ||||
|  | ||||
| @@ -124,23 +125,24 @@ struct gpt_params { | ||||
|  | ||||
|     std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale | ||||
|  | ||||
|     int32_t verbosity                  = 0; | ||||
|     int32_t control_vector_layer_start = -1; // layer range for control vector | ||||
|     int32_t control_vector_layer_end   = -1; // layer range for control vector | ||||
|  | ||||
|     int32_t ppl_stride      = 0;    // stride for perplexity calculations. If left at 0, the pre-existing approach will be used. | ||||
|     int32_t ppl_output_type = 0;    // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line | ||||
|                                     //                                       (which is more convenient to use for plotting) | ||||
|                                     // | ||||
|     bool   hellaswag       = false; // compute HellaSwag score over random tasks from datafile supplied in prompt | ||||
|     size_t hellaswag_tasks = 400;   // number of tasks to use when computing the HellaSwag score | ||||
|     int32_t ppl_stride      = 0;     // stride for perplexity calculations. If left at 0, the pre-existing approach will be used. | ||||
|     int32_t ppl_output_type = 0;     // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line | ||||
|                                      //                                       (which is more convenient to use for plotting) | ||||
|                                      // | ||||
|     bool   hellaswag        = false; // compute HellaSwag score over random tasks from datafile supplied in prompt | ||||
|     size_t hellaswag_tasks  = 400;   // number of tasks to use when computing the HellaSwag score | ||||
|  | ||||
|     bool   winogrande      = false; // compute Winogrande score over random tasks from datafile supplied in prompt | ||||
|     size_t winogrande_tasks= 0;     // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed | ||||
|     bool   winogrande       = false; // compute Winogrande score over random tasks from datafile supplied in prompt | ||||
|     size_t winogrande_tasks = 0;     // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed | ||||
|  | ||||
|     bool   multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt | ||||
|     size_t multiple_choice_tasks = 0;     // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed | ||||
|     bool   multiple_choice  = false;  // compute TruthfulQA score over random tasks from datafile supplied in prompt | ||||
|     size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed | ||||
|  | ||||
|     bool   kl_divergence   = false; // compute KL divergence | ||||
|     bool   kl_divergence    = false; // compute KL divergence | ||||
|  | ||||
|     bool usage             = false; // print usage | ||||
|     bool use_color         = false; // use color to distinguish generations and inputs | ||||
| @@ -163,7 +165,6 @@ struct gpt_params { | ||||
|     bool logits_all        = false; // return logits for all tokens in the batch | ||||
|     bool use_mmap          = true;  // use mmap for faster loads | ||||
|     bool use_mlock         = false; // use mlock to keep model in memory | ||||
|     bool verbose           = false; | ||||
|     bool verbose_prompt    = false; // print prompt tokens before generation | ||||
|     bool display_prompt    = true;  // print prompt before generation | ||||
|     bool infill            = false; // use infill mode | ||||
| @@ -180,10 +181,10 @@ struct gpt_params { | ||||
|     std::vector<std::string> image; // path to image file(s) | ||||
|  | ||||
|     // server params | ||||
|     int32_t port           = 8080; | ||||
|     int32_t timeout_read   = 600; | ||||
|     int32_t timeout_write  = timeout_read; | ||||
|     int32_t n_threads_http = -1; | ||||
|     int32_t port           = 8080;         // server listens on this network port | ||||
|     int32_t timeout_read   = 600;          // http read timeout in seconds | ||||
|     int32_t timeout_write  = timeout_read; // http write timeout in seconds | ||||
|     int32_t n_threads_http = -1;           // number of threads to use for http server (-1 = use n_threads) | ||||
|  | ||||
|     std::string hostname      = "127.0.0.1"; | ||||
|     std::string public_path   = ""; | ||||
| @@ -219,6 +220,16 @@ struct gpt_params { | ||||
|     // passkey params | ||||
|     int32_t n_junk = 250; // number of times to repeat the junk text | ||||
|     int32_t i_pos  = -1;  // position of the passkey in the junk text | ||||
|  | ||||
|     // imatrix params | ||||
|     std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file | ||||
|  | ||||
|     int32_t n_out_freq  = 10; // output the imatrix every n_out_freq iterations | ||||
|     int32_t n_save_freq =  0; // save the imatrix every n_save_freq iterations | ||||
|     int32_t i_chunk     =  0; // start processing from this chunk | ||||
|  | ||||
|     bool process_output = false; // collect data for the output tensor | ||||
|     bool compute_ppl    = true;  // whether to compute perplexity | ||||
| }; | ||||
|  | ||||
| void gpt_params_handle_model_default(gpt_params & params); | ||||
|   | ||||
| @@ -6,16 +6,19 @@ More information is available here: https://github.com/ggerganov/llama.cpp/pull/ | ||||
| ## Usage | ||||
|  | ||||
| ``` | ||||
| ./imatrix -m <some_fp_model> -f <some_training_data> [-o <output_file>] [--verbosity <verbosity_level>] | ||||
|         [-ofreq num_chunks] [-ow <0 or 1>] [other common params] | ||||
| ./imatrix \ | ||||
|     -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \ | ||||
|     [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \ | ||||
|     [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...] | ||||
| ``` | ||||
|  | ||||
| Here `-m` with a model name and `-f` with a file containing training data (such as `wiki.train.raw`) are mandatory. | ||||
| The parameters in square brackets are optional and have the following meaning: | ||||
| * `-o` (or `--output-file`) specifies the name of the file where the computed data will be stored. If missing `imatrix.dat` is used. | ||||
| * `--verbosity` specifies the verbosity level. If set to `0`, no output other than the perplexity of the processed chunks will be generated. If set to `1`, each time the results are saved a message is written to `stderr`. If `>=2`, a message is output each time data is collected for any tensor. Default verbosity level is `1`. | ||||
| * `-ofreq` (or `--output-frequency`) specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks) | ||||
| * `-ow` (or `--output-weight`) specifies if data will be collected for the `output.weight` tensor. My experience is that it is better to not utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default. | ||||
| * `--output-frequency` specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks) | ||||
| * `--save-frequency` specifies how often to save a copy of the imatrix in a separate file. Default is 0 (i.e., never) | ||||
| * `--process-output` specifies if data will be collected for the `output.weight` tensor. My experience is that it is better to not utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default. | ||||
|  | ||||
| For faster computation, make sure to use GPU offloading via the `-ngl` argument | ||||
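Because `--in-file` can be repeated, data from earlier runs can be loaded (and, when more than one file is given, merged) before new chunks are processed. A plausible invocation continuing from two previous runs; the file names are placeholders:

```
./imatrix -m model.gguf -f some-text.txt \
    --in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat \
    -o imatrix-combined.dat
```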
|  | ||||
|   | ||||
| @@ -17,39 +17,37 @@ | ||||
| #pragma warning(disable: 4244 4267) // possible loss of data | ||||
| #endif | ||||
|  | ||||
| static void print_usage(int argc, char ** argv, const gpt_params & params) { | ||||
|     gpt_params_print_usage(argc, argv, params); | ||||
|  | ||||
|     LOG_TEE("\nexample usage:\n"); | ||||
|     LOG_TEE("\n    %s \\\n" | ||||
|             "       -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \\\n" | ||||
|             "       [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n" | ||||
|             "       [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" , argv[0]); | ||||
|     LOG_TEE("\n"); | ||||
| } | ||||
|  | ||||
| struct Stats { | ||||
|     std::vector<float> values; | ||||
|     std::vector<int> counts; | ||||
|     int ncall = 0; | ||||
| }; | ||||
|  | ||||
| struct StatParams { | ||||
|     std::string dataset; | ||||
|     std::string ofile = "imatrix.dat"; | ||||
|     int         n_output_frequency = 10; | ||||
|     int         verbosity = 1; | ||||
|     int         keep_every = 0; | ||||
|     bool        collect_output_weight = false; | ||||
| }; | ||||
|  | ||||
| class IMatrixCollector { | ||||
| public: | ||||
|     IMatrixCollector() = default; | ||||
|     void set_parameters(StatParams&& params) { m_params = std::move(params); } | ||||
|     void set_params(gpt_params params) { m_params = std::move(params); } | ||||
|     bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data); | ||||
|     void save_imatrix() const; | ||||
|     bool load_imatrix(const char * file_name, bool add); | ||||
|     static bool load_imatrix(const char * file_name, std::unordered_map<std::string, Stats>& imatrix); | ||||
|     void save_imatrix(int ncall = -1) const; | ||||
|     bool load_imatrix(const char * file_name); | ||||
| private: | ||||
|     std::unordered_map<std::string, Stats> m_stats; | ||||
|     StatParams                             m_params; | ||||
|     gpt_params                             m_params; | ||||
|     std::mutex                             m_mutex; | ||||
|     int                                    m_last_call = 0; | ||||
|     std::vector<float>                     m_src1_data; | ||||
|     std::vector<char>                      m_ids; // the expert ids from ggml_mul_mat_id | ||||
|                                                   // | ||||
|     void save_imatrix(const char * file_name, const char * dataset) const; | ||||
|     void keep_imatrix(int ncall) const; | ||||
| }; | ||||
|  | ||||
| // remove any prefix and suffixes from the name | ||||
| @@ -85,7 +83,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * | ||||
|         if (t->op != GGML_OP_MUL_MAT) return false; | ||||
|         // why are small batches ignored (<16 tokens)? | ||||
|         if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false; | ||||
|         if (!(wname.substr(0, 4) == "blk." || (m_params.collect_output_weight && wname == "output.weight"))) return false; | ||||
|         if (!(wname.substr(0, 4) == "blk." || (m_params.process_output && wname == "output.weight"))) return false; | ||||
|         return true; | ||||
|     } | ||||
|  | ||||
| @@ -158,16 +156,16 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * | ||||
|             } | ||||
|             if (e.ncall > m_last_call) { | ||||
|                 m_last_call = e.ncall; | ||||
|                 if (m_last_call % m_params.n_output_frequency == 0) { | ||||
|                 if (m_last_call % m_params.n_out_freq == 0) { | ||||
|                     save_imatrix(); | ||||
|                 } | ||||
|                 if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) { | ||||
|                     keep_imatrix(m_last_call); | ||||
|                 if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) { | ||||
|                     save_imatrix(m_last_call); | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|     } else { | ||||
|         auto& e = m_stats[wname]; | ||||
|         auto & e = m_stats[wname]; | ||||
|         if (e.values.empty()) { | ||||
|             e.values.resize(src1->ne[0], 0); | ||||
|             e.counts.resize(src1->ne[0], 0); | ||||
| @@ -189,11 +187,11 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * | ||||
|         } | ||||
|         if (e.ncall > m_last_call) { | ||||
|             m_last_call = e.ncall; | ||||
|             if (m_last_call % m_params.n_output_frequency == 0) { | ||||
|             if (m_last_call % m_params.n_out_freq == 0) { | ||||
|                 save_imatrix(); | ||||
|             } | ||||
|             if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) { | ||||
|                 keep_imatrix(m_last_call); | ||||
|             if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) { | ||||
|                 save_imatrix(m_last_call); | ||||
|             } | ||||
|         } | ||||
|     } | ||||
| @@ -201,19 +199,17 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * | ||||
|     return true; | ||||
| } | ||||
|  | ||||
| void IMatrixCollector::save_imatrix() const { | ||||
|     save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str(), m_params.dataset.c_str()); | ||||
| } | ||||
| void IMatrixCollector::save_imatrix(int ncall) const { | ||||
|     auto fname = m_params.out_file; | ||||
|     if (fname.empty()) { | ||||
|         fname = "imatrix.dat"; | ||||
|     } | ||||
|  | ||||
| void IMatrixCollector::keep_imatrix(int ncall) const { | ||||
|     auto file_name = m_params.ofile; | ||||
|     if (file_name.empty()) file_name = "imatrix.dat"; | ||||
|     file_name += ".at_"; | ||||
|     file_name += std::to_string(ncall); | ||||
|     save_imatrix(file_name.c_str(), m_params.dataset.c_str()); | ||||
| } | ||||
|     if (ncall > 0) { | ||||
|         fname += ".at_"; | ||||
|         fname += std::to_string(ncall); | ||||
|     } | ||||
|  | ||||
| void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) const { | ||||
|     std::ofstream out(fname, std::ios::binary); | ||||
|     int n_entries = m_stats.size(); | ||||
|     out.write((const char *) &n_entries, sizeof(n_entries)); | ||||
| @@ -236,26 +232,28 @@ void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) co | ||||
|     // Write the number of call the matrix was computed with | ||||
|     out.write((const char *) &m_last_call, sizeof(m_last_call)); | ||||
|  | ||||
|     // Write the dataset name at the end of the file to later on specify it in quantize | ||||
|     int n_dataset = strlen(dataset); | ||||
|     out.write((const char *) &n_dataset, sizeof(n_dataset)); | ||||
|     out.write(dataset, n_dataset); | ||||
|     // Write the input filename at the end of the file to later on specify it in quantize | ||||
|     { | ||||
|         int len = m_params.prompt_file.size(); | ||||
|         out.write((const char *) &len, sizeof(len)); | ||||
|         out.write(m_params.prompt_file.c_str(), len); | ||||
|     } | ||||
|  | ||||
|     if (m_params.verbosity > 0) { | ||||
|         fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname); | ||||
|         fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str()); | ||||
|     } | ||||
| } | ||||
|  | ||||
| bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_map<std::string, Stats>& imatrix_data) { | ||||
|     std::ifstream in(imatrix_file, std::ios::binary); | ||||
| bool IMatrixCollector::load_imatrix(const char * fname) { | ||||
|     std::ifstream in(fname, std::ios::binary); | ||||
|     if (!in) { | ||||
|         printf("%s: failed to open %s\n",__func__,imatrix_file); | ||||
|         printf("%s: failed to open %s\n",__func__, fname); | ||||
|         return false; | ||||
|     } | ||||
|     int n_entries; | ||||
|     in.read((char*)&n_entries, sizeof(n_entries)); | ||||
|     if (in.fail() || n_entries < 1) { | ||||
|         printf("%s: no data in file %s\n", __func__, imatrix_file); | ||||
|         printf("%s: no data in file %s\n", __func__, fname); | ||||
|         return false; | ||||
|     } | ||||
|     for (int i = 0; i < n_entries; ++i) { | ||||
| @@ -263,23 +261,22 @@ bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_ma | ||||
|         std::vector<char> name_as_vec(len+1); | ||||
|         in.read((char *)name_as_vec.data(), len); | ||||
|         if (in.fail()) { | ||||
|             printf("%s: failed reading name for entry %d from %s\n",__func__,i+1,imatrix_file); | ||||
|             printf("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname); | ||||
|             return false; | ||||
|         } | ||||
|         name_as_vec[len] = 0; | ||||
|         std::string name{name_as_vec.data()}; | ||||
|         auto& e = imatrix_data[std::move(name)]; | ||||
|         auto & e = m_stats[std::move(name)]; | ||||
|         int ncall; | ||||
|         in.read((char*)&ncall, sizeof(ncall)); | ||||
|         int nval; | ||||
|         in.read((char *)&nval, sizeof(nval)); | ||||
|         if (in.fail() || nval < 1) { | ||||
|             printf("%s: failed reading number of values for entry %d\n",__func__,i); | ||||
|             imatrix_data = {}; | ||||
|             m_stats = {}; | ||||
|             return false; | ||||
|         } | ||||
|  | ||||
|         // When re-called from load_imatrix() with add set, this will already be created. | ||||
|         if (e.values.empty()) { | ||||
|             e.values.resize(nval, 0); | ||||
|             e.counts.resize(nval, 0); | ||||
| @@ -289,7 +286,7 @@ bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_ma | ||||
|         in.read((char*)tmp.data(), nval*sizeof(float)); | ||||
|         if (in.fail()) { | ||||
|             printf("%s: failed reading data for entry %d\n",__func__,i); | ||||
|             imatrix_data = {}; | ||||
|             m_stats = {}; | ||||
|             return false; | ||||
|         } | ||||
|  | ||||
| @@ -304,13 +301,6 @@ bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_ma | ||||
|     return true; | ||||
| } | ||||
|  | ||||
| bool IMatrixCollector::load_imatrix(const char * file_name, bool add) { | ||||
|     if (!add) { | ||||
|         m_stats.clear(); | ||||
|     } | ||||
|     return load_imatrix(file_name, m_stats); | ||||
| } | ||||
|  | ||||
| static IMatrixCollector g_collector; | ||||
|  | ||||
| static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) { | ||||
| @@ -324,7 +314,7 @@ struct results_log_softmax { | ||||
|     float  prob; | ||||
| }; | ||||
|  | ||||
| static std::vector<float> softmax(const std::vector<float>& logits) { | ||||
| static std::vector<float> softmax(const std::vector<float> & logits) { | ||||
|     std::vector<float> probs(logits.size()); | ||||
|     float max_logit = logits[0]; | ||||
|     for (float v : logits) { | ||||
| @@ -358,8 +348,7 @@ static results_log_softmax log_softmax(int n_vocab, const float * logits, int to | ||||
|  | ||||
| static void process_logits( | ||||
|     int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers, | ||||
|     double & nll, double & nll2, float * logit_history, float * prob_history | ||||
| ) { | ||||
|     double & nll, double & nll2, float * logit_history, float * prob_history) { | ||||
|     std::mutex mutex; | ||||
|     int counter = 0; | ||||
|     auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () { | ||||
| @@ -391,8 +380,7 @@ static void process_logits( | ||||
|     } | ||||
| } | ||||
|  | ||||
| static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl, int from_chunk) { | ||||
|  | ||||
| static bool compute_imatrix(llama_context * ctx, const gpt_params & params) { | ||||
|     const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); | ||||
|     GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1); | ||||
|     const int n_ctx = llama_n_ctx(ctx); | ||||
| @@ -405,13 +393,13 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool | ||||
|     auto tim2 = std::chrono::high_resolution_clock::now(); | ||||
|     fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count()); | ||||
|  | ||||
|     if (from_chunk > 0) { | ||||
|         if (size_t((from_chunk + 2)*n_ctx) >= tokens.size()) { | ||||
|             fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, from_chunk); | ||||
|     if (params.i_chunk > 0) { | ||||
|         if (size_t((params.i_chunk + 2)*n_ctx) >= tokens.size()) { | ||||
|             fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk); | ||||
|             return false; | ||||
|         } | ||||
|         fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, from_chunk, from_chunk*n_ctx); | ||||
|         tokens.erase(tokens.begin(), tokens.begin() + from_chunk*n_ctx); | ||||
|         fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx); | ||||
|         tokens.erase(tokens.begin(), tokens.begin() + params.i_chunk*n_ctx); | ||||
|     } | ||||
|  | ||||
|     if (int(tokens.size()) < 2*n_ctx) { | ||||
| @@ -424,7 +412,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool | ||||
|     std::vector<float> logit_history; | ||||
|     std::vector<float> prob_history; | ||||
|  | ||||
|     if (compute_ppl) { | ||||
|     if (params.compute_ppl) { | ||||
|         logit_history.resize(tokens.size()); | ||||
|         prob_history.resize(tokens.size()); | ||||
|     } | ||||
| @@ -446,7 +434,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool | ||||
|     const int num_batches = (n_ctx + n_batch - 1) / n_batch; | ||||
|  | ||||
|     std::vector<float> logits; | ||||
|     if (compute_ppl && num_batches > 1) { | ||||
|     if (params.compute_ppl && num_batches > 1) { | ||||
|         logits.reserve((size_t)n_ctx * n_vocab); | ||||
|     } | ||||
|  | ||||
| @@ -482,7 +470,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool | ||||
|             // restore the original token in case it was set to BOS | ||||
|             tokens[batch_start] = token_org; | ||||
|  | ||||
|             if (compute_ppl && num_batches > 1) { | ||||
|             if (params.compute_ppl && num_batches > 1) { | ||||
|                 const auto * batch_logits = llama_get_logits(ctx); | ||||
|                 logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab); | ||||
|             } | ||||
| @@ -501,7 +489,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool | ||||
|             fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0); | ||||
|         } | ||||
|  | ||||
|         if (compute_ppl) { | ||||
|         if (params.compute_ppl) { | ||||
|             const int first = n_ctx/2; | ||||
|             const auto all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx); | ||||
|             process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first, | ||||
| @@ -516,7 +504,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool | ||||
|     } | ||||
|     printf("\n"); | ||||
|  | ||||
|     if (compute_ppl) { | ||||
|     if (params.compute_ppl) { | ||||
|         nll2 /= count; | ||||
|         nll /= count; | ||||
|         const double ppl = exp(nll); | ||||
| @@ -533,109 +521,32 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool | ||||
| } | ||||
|  | ||||
| int main(int argc, char ** argv) { | ||||
|     StatParams sparams; | ||||
|     std::string prev_result_file; | ||||
|     std::string combine_files; | ||||
|     bool compute_ppl = true; | ||||
|     int  from_chunk  = 0; | ||||
|     std::vector<char*> args; | ||||
|     args.push_back(argv[0]); | ||||
|     int iarg = 1; | ||||
|     for (; iarg < argc-1; ++iarg) { | ||||
|         std::string arg{argv[iarg]}; | ||||
|         if (arg == "-o" || arg == "--output-file") { | ||||
|             sparams.ofile = argv[++iarg]; | ||||
|         } | ||||
|         else if (arg == "-ofreq" || arg == "--output-frequency") { | ||||
|             sparams.n_output_frequency = std::stoi(argv[++iarg]); | ||||
|         } | ||||
|         else if (arg == "-ow" || arg == "--output-weight") { | ||||
|             sparams.collect_output_weight = std::stoi(argv[++iarg]); | ||||
|         } | ||||
|         else if (arg == "--verbosity") { | ||||
|             sparams.verbosity = std::stoi(argv[++iarg]); | ||||
|         } else if (arg == "--no-ppl") { | ||||
|             compute_ppl = false; | ||||
|         } else if (arg == "--keep-imatrix") { | ||||
|             sparams.keep_every = std::stoi(argv[++iarg]); | ||||
|         } else if (arg == "--continue-from") { | ||||
|             prev_result_file = argv[++iarg]; | ||||
|         } else if (arg == "--combine") { | ||||
|             combine_files = argv[++iarg]; | ||||
|         } | ||||
|         else if (arg == "--from-chunk") { | ||||
|             from_chunk = std::stoi(argv[++iarg]); | ||||
|         } else { | ||||
|             args.push_back(argv[iarg]); | ||||
|         } | ||||
|     } | ||||
|     if (iarg < argc) { | ||||
|         std::string arg{argv[iarg]}; | ||||
|         if (arg == "--no-ppl") { | ||||
|             compute_ppl = false; | ||||
|         } else { | ||||
|             args.push_back(argv[iarg]); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     gpt_params params; | ||||
|     params.n_batch = 512; | ||||
|  | ||||
|     params.n_ctx = 512; | ||||
|     params.logits_all = true; | ||||
|     params.verbosity = 1; | ||||
|  | ||||
|     if (!gpt_params_parse(argc, argv, params)) { | ||||
|         gpt_params_print_usage(argc, argv, params); | ||||
|         print_usage(argc, argv, params); | ||||
|         return 1; | ||||
|     } | ||||
|  | ||||
|     params.logits_all = true; | ||||
|     params.n_batch = std::min(params.n_batch, params.n_ctx); | ||||
|  | ||||
|     print_build_info(); | ||||
|     g_collector.set_params(params); | ||||
|  | ||||
|     if (params.seed == LLAMA_DEFAULT_SEED) { | ||||
|         params.seed = time(NULL); | ||||
|     } | ||||
|  | ||||
|     fprintf(stderr, "%s: seed  = %u\n", __func__, params.seed); | ||||
|  | ||||
|     std::mt19937 rng(params.seed); | ||||
|  | ||||
|     sparams.dataset = params.prompt_file; | ||||
|     g_collector.set_parameters(std::move(sparams)); | ||||
|  | ||||
|     if (!combine_files.empty()) { | ||||
|         std::vector<std::string> files; | ||||
|         size_t pos = 0; | ||||
|         while (true) { | ||||
|             auto new_pos = combine_files.find(',', pos); | ||||
|             if (new_pos != std::string::npos) { | ||||
|                 files.emplace_back(combine_files.substr(pos, new_pos - pos)); | ||||
|                 pos = new_pos + 1; | ||||
|             } else { | ||||
|                 files.emplace_back(combine_files.substr(pos)); | ||||
|                 break; | ||||
|             } | ||||
|         } | ||||
|         if (files.size() < 2) { | ||||
|             fprintf(stderr, "You must provide at least two comma separated files to use --combine\n"); | ||||
|     for (const auto & in_file : params.in_files) { | ||||
|         printf("%s : loading imatrix from '%s'\n", __func__, in_file.c_str()); | ||||
|         if (!g_collector.load_imatrix(in_file.c_str())) { | ||||
|             fprintf(stderr, "%s : failed to load %s\n", __func__, in_file.c_str()); | ||||
|             return 1; | ||||
|         } | ||||
|         printf("Combining the following %d files\n", int(files.size())); | ||||
|         for (auto& file : files) { | ||||
|             printf("    %s\n", file.c_str()); | ||||
|             if (!g_collector.load_imatrix(file.c_str(), true)) { | ||||
|                 fprintf(stderr, "Failed to load %s\n", file.c_str()); | ||||
|                 return 1; | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     if (params.in_files.size() > 1) { | ||||
|         printf("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str()); | ||||
|         g_collector.save_imatrix(); | ||||
|         return 0; | ||||
|     } | ||||
|  | ||||
|     if (!prev_result_file.empty()) { | ||||
|         if (!g_collector.load_imatrix(prev_result_file.c_str(), false)) { | ||||
|             fprintf(stderr, "=============== Failed to load %s\n", prev_result_file.c_str()); | ||||
|             return 1; | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     llama_backend_init(); | ||||
| @@ -650,6 +561,7 @@ int main(int argc, char ** argv) { | ||||
|     // init | ||||
|     llama_model * model; | ||||
|     llama_context * ctx; | ||||
|  | ||||
|     std::tie(model, ctx) = llama_init_from_gpt_params(params); | ||||
|     if (model == nullptr || ctx == nullptr) { | ||||
|         fprintf(stderr, "%s : failed to init\n", __func__); | ||||
| @@ -668,8 +580,7 @@ int main(int argc, char ** argv) { | ||||
|         fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str()); | ||||
|     } | ||||
|  | ||||
|     bool OK = compute_imatrix(ctx, params, compute_ppl, from_chunk); | ||||
|     if (!OK) { | ||||
|     if (!compute_imatrix(ctx, params)) { | ||||
|         return 1; | ||||
|     } | ||||
|  | ||||
|   | ||||
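The renamed `save_imatrix(int ncall)` above folds the old `keep_imatrix()` behavior into a single function: a non-positive `ncall` writes the main output file, while a positive `ncall` (driven by `--save-frequency`) writes a suffixed copy. A small standalone sketch of that naming rule; the helper function name is mine, not from the commit:

```cpp
// Sketch of the file-naming rule used by IMatrixCollector::save_imatrix(int ncall).
#include <iostream>
#include <string>

static std::string imatrix_save_name(std::string out_file, int ncall) {
    if (out_file.empty()) {
        out_file = "imatrix.dat"; // fallback when no output file is set
    }
    if (ncall > 0) {
        out_file += ".at_" + std::to_string(ncall); // periodic copy from --save-frequency
    }
    return out_file;
}

int main() {
    std::cout << imatrix_save_name("imatrix.dat",  0) << "\n"; // -> imatrix.dat
    std::cout << imatrix_save_name("imatrix.dat", 50) << "\n"; // -> imatrix.dat.at_50
    return 0;
}
```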
| @@ -2360,7 +2360,7 @@ int main(int argc, char ** argv) { | ||||
|  | ||||
|     // TODO: not great to use extern vars | ||||
|     server_log_json = params.log_json; | ||||
|     server_verbose = params.verbose; | ||||
|     server_verbose = params.verbosity > 0; | ||||
|  | ||||
|     // struct that contains llama context and inference | ||||
|     server_context ctx_server; | ||||
|   | ||||