mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-31 08:51:55 +00:00 
			
		
		
		
	batched-bench : add --output-format jsonl option (#9293)
				
					
				
			`--output-format` is modeled after `llama-bench`'s options
This commit is contained in:
		| @@ -1678,6 +1678,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa | ||||
|         else { invalid_param = true; } | ||||
|         return true; | ||||
|     } | ||||
|     if (arg == "--output-format") { | ||||
|         CHECK_ARG | ||||
|         std::string value(argv[i]); | ||||
|         /**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; } | ||||
|         else if (value == "md") { params.batched_bench_output_jsonl = false; } | ||||
|         else { invalid_param = true; } | ||||
|         return true; | ||||
|     } | ||||
|     if (arg == "--no-warmup") { | ||||
|         params.warmup = false; | ||||
|         return true; | ||||
| @@ -2068,6 +2076,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param | ||||
|     options.push_back({ "export-lora", "       --lora-scaled FNAME S",  "path to LoRA adapter with user defined scaling S  (can be repeated to use multiple adapters)" }); | ||||
|     options.push_back({ "export-lora", "-o,    --output FNAME",         "output file (default: '%s')", params.lora_outfile.c_str() }); | ||||
|  | ||||
|     options.push_back({ "batched-bench" }); | ||||
|     options.push_back({ "batched-bench", "       --output-format {md,jsonl}", "output format for batched-bench results (default: md)" }); | ||||
|  | ||||
|     printf("usage: %s [options]\n", argv[0]); | ||||
|  | ||||
|     for (const auto & o : options) { | ||||
|   | ||||
| @@ -275,6 +275,9 @@ struct gpt_params { | ||||
|     bool spm_infill = false; // suffix/prefix/middle pattern for infill | ||||
|  | ||||
|     std::string lora_outfile = "ggml-lora-merged-f16.gguf"; | ||||
|  | ||||
|     // batched-bench params | ||||
|     bool batched_bench_output_jsonl = false; | ||||
| }; | ||||
|  | ||||
| void gpt_params_parse_from_env(gpt_params & params); | ||||
|   | ||||
| @@ -49,3 +49,12 @@ There are 2 modes of operation: | ||||
| |   128 |    256 |    8 |   3072 |    0.751 |  1363.92 |   15.110 |   135.54 |   15.861 |   193.69 | | ||||
| |   128 |    256 |   16 |   6144 |    1.569 |  1304.93 |   18.073 |   226.64 |   19.642 |   312.80 | | ||||
| |   128 |    256 |   32 |  12288 |    3.409 |  1201.35 |   19.223 |   426.15 |   22.633 |   542.93 | | ||||
|  | ||||
| ### JSONL output | ||||
|  | ||||
| Pass `--output-format jsonl` to output JSONL (one JSON object per result row) instead of the Markdown table, for example: | ||||
|  | ||||
| ```json lines | ||||
| {"n_kv_max": 2048, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "is_pp_shared": 0, "n_gpu_layers": 99, "n_threads": 8, "n_threads_batch": 8, "pp": 128, "tg": 128, "pl": 1, "n_kv": 256, "t_pp": 0.233810, "speed_pp": 547.453064, "t_tg": 3.503684, "speed_tg": 36.532974, "t": 3.737494, "speed": 68.495094} | ||||
| {"n_kv_max": 2048, "n_batch": 2048, "n_ubatch": 512, "flash_attn": 0, "is_pp_shared": 0, "n_gpu_layers": 99, "n_threads": 8, "n_threads_batch": 8, "pp": 128, "tg": 128, "pl": 2, "n_kv": 512, "t_pp": 0.422602, "speed_pp": 605.770935, "t_tg": 11.106112, "speed_tg": 23.050371, "t": 11.528713, "speed": 44.410854} | ||||
| ``` | ||||
|   | ||||
| @@ -122,12 +122,13 @@ int main(int argc, char ** argv) { | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     LOG_TEE("\n"); | ||||
|     LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch); | ||||
|     LOG_TEE("\n"); | ||||
|  | ||||
|     LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP",     "TG",     "B",    "N_KV",     "T_PP s",   "S_PP t/s", "T_TG s",   "S_TG t/s", "T s",      "S t/s"); | ||||
|     LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------"); | ||||
|     if (!params.batched_bench_output_jsonl) { | ||||
|         LOG_TEE("\n"); | ||||
|         LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch); | ||||
|         LOG_TEE("\n"); | ||||
|         LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s"); | ||||
|         LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------"); | ||||
|     } | ||||
|  | ||||
|     for (        int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) { | ||||
|         for (    int i_tg = 0; i_tg < (int) n_tg.size(); ++i_tg) { | ||||
| @@ -195,7 +196,16 @@ int main(int argc, char ** argv) { | ||||
|                 const float speed_tg = pl*tg / t_tg; | ||||
|                 const float speed    = n_kv / t; | ||||
|  | ||||
|                 LOG_TEE("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed); | ||||
|                 if(params.batched_bench_output_jsonl) { | ||||
|                     LOG_TEE( | ||||
|                         "{\"n_kv_max\": %d, \"n_batch\": %d, \"n_ubatch\": %d, \"flash_attn\": %d, \"is_pp_shared\": %d, \"n_gpu_layers\": %d, \"n_threads\": %u, \"n_threads_batch\": %u, " | ||||
|                         "\"pp\": %d, \"tg\": %d, \"pl\": %d, \"n_kv\": %d, \"t_pp\": %f, \"speed_pp\": %f, \"t_tg\": %f, \"speed_tg\": %f, \"t\": %f, \"speed\": %f}\n", | ||||
|                         n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch, | ||||
|                         pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed | ||||
|                     ); | ||||
|                 } else { | ||||
|                     LOG_TEE("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed); | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Aarni Koskela
					Aarni Koskela