Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-31 08:51:55 +00:00)
	Merge remote-tracking branch 'origin/master' into server-cfg
@@ -7,6 +7,9 @@ target_compile_definitions(${TARGET} PRIVATE
     SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
 )
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+if (WIN32)
+    TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
+endif()
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
   add_dependencies(${TARGET} BUILD_INFO)
@@ -440,7 +440,7 @@ struct llama_server_context {
 
             if (cfg_enabled) {
                 llama_sample_classifier_free_guidance(
-                    ctx, &candidates_p, evaluator_guidance.ctx, params.cfg_scale, 1.0);
+                    ctx, &candidates_p, evaluator_guidance.ctx, params.cfg_scale);
             }
 
             // Apply penalties
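The hunk above tracks the upstream API change in which llama_sample_classifier_free_guidance dropped its trailing smoothing-factor argument, leaving a four-argument call. A minimal sketch of the post-merge call shape, assuming a main context ctx that has evaluated the prompt and a second context evaluator_guidance.ctx that has evaluated the negative prompt (only the names that appear in the hunk are taken from this commit; the rest is illustrative):

    // Build the candidate list from the current logits of the main context.
    const int n_vocab = llama_n_vocab(ctx);
    float * logits = llama_get_logits(ctx);
    std::vector<llama_token_data> candidates;
    candidates.reserve(n_vocab);
    for (llama_token id = 0; id < n_vocab; ++id) {
        candidates.push_back({id, logits[id], 0.0f});
    }
    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

    if (cfg_enabled) {
        // cfg_scale > 1.0f pushes sampling away from the guidance (negative-prompt)
        // distribution; a scale of 1.0f leaves the logits unchanged.
        llama_sample_classifier_free_guidance(
            ctx, &candidates_p, evaluator_guidance.ctx, params.cfg_scale);
    }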
@@ -602,47 +602,49 @@ struct llama_server_context {
     }
 };
 
-static void server_print_usage(const char * argv0, const gpt_params & params,
-                               const server_params & sparams) {
-    fprintf(stderr, "usage: %s [options]\n", argv0);
-    fprintf(stderr, "\n");
-    fprintf(stderr, "options:\n");
-    fprintf(stderr, "  -h, --help            show this help message and exit\n");
-    fprintf(stderr, "  -v, --verbose         verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
-    fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
-    fprintf(stderr, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
-    fprintf(stderr, "  --rope-freq-base N    RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
-    fprintf(stderr, "  --rope-freq-scale N   RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
-    fprintf(stderr, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
-    fprintf(stderr, "  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
-    fprintf(stderr, "                        not recommended: doubles context memory required and no measurable increase in quality\n");
+static void server_print_usage(const char *argv0, const gpt_params &params,
+                               const server_params &sparams)
+{
+    fprintf(stdout, "usage: %s [options]\n", argv0);
+    fprintf(stdout, "\n");
+    fprintf(stdout, "options:\n");
+    fprintf(stdout, "  -h, --help            show this help message and exit\n");
+    fprintf(stdout, "  -v, --verbose         verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
+    fprintf(stdout, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
+    fprintf(stdout, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
+    fprintf(stdout, "  -gqa N, --gqa N       grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa);
+    fprintf(stdout, "  --rope-freq-base N    RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
+    fprintf(stdout, "  --rope-freq-scale N   RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
+    fprintf(stdout, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
+    fprintf(stdout, "  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
+    fprintf(stdout, "                        not recommended: doubles context memory required and no measurable increase in quality\n");
     if (llama_mlock_supported()) {
-        fprintf(stderr, "  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
+        fprintf(stdout, "  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
     }
     if (llama_mmap_supported()) {
-        fprintf(stderr, "  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
+        fprintf(stdout, "  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
     }
 #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
-    fprintf(stderr, "  -ngl N, --n-gpu-layers N\n");
-    fprintf(stderr, "                        number of layers to store in VRAM\n");
-    fprintf(stderr, "  -ts SPLIT --tensor-split SPLIT\n");
-    fprintf(stderr, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
-    fprintf(stderr, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
-    fprintf(stderr, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
-    fprintf(stderr, "  -lv, --low-vram don't allocate VRAM scratch buffer\n");
+    fprintf(stdout, "  -ngl N, --n-gpu-layers N\n");
+    fprintf(stdout, "                        number of layers to store in VRAM\n");
+    fprintf(stdout, "  -ts SPLIT --tensor-split SPLIT\n");
+    fprintf(stdout, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
+    fprintf(stdout, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
+    fprintf(stdout, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
+    fprintf(stdout, "  -lv, --low-vram don't allocate VRAM scratch buffer\n");
 #endif
-    fprintf(stderr, "  -m FNAME, --model FNAME\n");
-    fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
-    fprintf(stderr, "  -a ALIAS, --alias ALIAS\n");
-    fprintf(stderr, "                        set an alias for the model, will be added as `model` field in completion response\n");
-    fprintf(stderr, "  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
-    fprintf(stderr, "  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n");
-    fprintf(stderr, "  --host                ip address to listen (default  (default: %s)\n", sparams.hostname.c_str());
-    fprintf(stderr, "  --port PORT           port to listen (default  (default: %d)\n", sparams.port);
-    fprintf(stderr, "  --path PUBLIC_PATH    path from which to serve static files (default %s)\n", sparams.public_path.c_str());
-    fprintf(stderr, "  -to N, --timeout N    server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
-    fprintf(stderr, "  --embedding           enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
-    fprintf(stderr, "\n");
+    fprintf(stdout, "  -m FNAME, --model FNAME\n");
+    fprintf(stdout, "                        model path (default: %s)\n", params.model.c_str());
+    fprintf(stdout, "  -a ALIAS, --alias ALIAS\n");
+    fprintf(stdout, "                        set an alias for the model, will be added as `model` field in completion response\n");
+    fprintf(stdout, "  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
+    fprintf(stdout, "  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n");
+    fprintf(stdout, "  --host                ip address to listen (default  (default: %s)\n", sparams.hostname.c_str());
+    fprintf(stdout, "  --port PORT           port to listen (default  (default: %d)\n", sparams.port);
+    fprintf(stdout, "  --path PUBLIC_PATH    path from which to serve static files (default %s)\n", sparams.public_path.c_str());
+    fprintf(stdout, "  -to N, --timeout N    server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
+    fprintf(stdout, "  --embedding           enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
+    fprintf(stdout, "\n");
 }
 
 static void server_params_parse(int argc, char ** argv, server_params & sparams,
@@ -700,6 +702,12 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
                 break;
             }
             params.n_ctx = std::stoi(argv[i]);
+        } else if (arg == "-gqa" || arg == "--gqa") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_gqa = std::stoi(argv[i]);
         } else if (arg == "--rope-freq-base") {
             if (++i >= argc) {
                 invalid_param = true;
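The new -gqa/--gqa option above only fills params.n_gqa; the value takes effect when the server builds its context parameters before loading the model. A rough sketch of that plumbing, assuming the llama_context_params/gpt_params fields of that era (taken from llama.h/common.h of the time, not from this diff):

    // server_params_parse() fills params.n_gqa; the server then copies the
    // gpt_params fields into the context parameters used to load the model.
    llama_context_params lparams = llama_context_default_params();
    lparams.n_ctx   = params.n_ctx;
    lparams.n_batch = params.n_batch;
    lparams.n_gqa   = params.n_gqa;   // 8 for LLaMA v2 70B, 1 for other models

A typical invocation would then look something like `./server -m <path to a 70B model> -gqa 8 -c 4096` (model path hypothetical). The flag was marked TEMP because the grouped-query-attention factor later moved into the model metadata itself.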
@@ -739,8 +747,7 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
                         "See main README.md for information on enabling GPU BLAS support",
                         {{ "n_gpu_layers", params.n_gpu_layers }});
 #endif
-        }
-        else if (arg == "--tensor-split" || arg == "-ts") {
+        } else if (arg == "--tensor-split" || arg == "-ts") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
@@ -770,7 +777,7 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
 #else
             fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
 #endif // GGML_USE_CUBLAS
-        }else if (arg == "--main-gpu" || arg == "-mg") {
+        } else if (arg == "--main-gpu" || arg == "-mg") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;