	Only show -ngl option when relevant + other doc/arg handling updates (#1625)
1. Add a `LLAMA_SUPPORTS_GPU_OFFLOAD` define to `llama.h` (defined when compiled with CLBlast or cuBLAS).
2. Update the argument handling in the common example code to only show the `-ngl`, `--n-gpu-layers` option when GPU offload is possible.
3. Add an entry for the `-ngl`, `--n-gpu-layers` option to the `main` and `server` examples documentation.
4. Update the `main` and `server` examples documentation to use the new-style dash-separator argument format.
5. Update the `server` example to use dash separators for its arguments and add `-ngl` to `--help` (only shown when compiled with appropriate support). It still accepts `--memory_f32` and `--ctx_size` for compatibility.
6. Add a warning discouraging use of `--memory-f32` to the `main` and `server` examples' `--help` text as well as the documentation.

Rationale: https://github.com/ggerganov/llama.cpp/discussions/1593#discussioncomment-6004356
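For orientation, here is a minimal sketch of how items 1 and 2 can fit together. The `GGML_USE_CUBLAS`/`GGML_USE_CLBLAST` backend macros are assumptions about how the GPU BLAS builds are selected, not a quote of the actual header:

```cpp
// Sketch only: a capability define exposed by llama.h when built with a
// GPU-capable BLAS backend. The backend-selection macros are assumed here.
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
#define LLAMA_SUPPORTS_GPU_OFFLOAD
#endif

#include <cstdio>

// Example-side argument help: the -ngl entry is only printed when the build
// can actually offload layers, so CPU-only builds do not advertise it.
static void print_gpu_offload_help() {
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
    std::fprintf(stderr, "  -ngl N, --n-gpu-layers N\n");
    std::fprintf(stderr, "                        number of layers to store in VRAM\n");
#endif
}
```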
@@ -285,7 +285,8 @@ Test();
 ## Common Options
 
 -   `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
--   `-c N, --ctx_size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
+-   `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
+-   `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
 -   `--embedding`: Enable the embedding mode. **Completion function doesn't work in this mode**.
 -   `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`;
 -   `--port`: Set the port to listen. Default: `8080`.
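As an aside to the two options documented above, the sketch below shows roughly where the parsed `--ctx-size` and `--n-gpu-layers` values typically land; the `llama_context_params` field names (`n_ctx`, `n_gpu_layers`) are based on the llama.h of this period and should be read as an assumption rather than a reference:

```cpp
// Sketch only: mapping the two documented options onto context parameters.
// Field names assume the llama_context_params of this era of llama.h.
#include "llama.h"

llama_context_params make_ctx_params(int n_ctx, int n_gpu_layers) {
    llama_context_params p = llama_context_default_params();
    p.n_ctx        = n_ctx;        // -c / --ctx-size (LLaMA models were trained with 2048)
    p.n_gpu_layers = n_gpu_layers; // -ngl / --n-gpu-layers; no effect without GPU BLAS support
    return p;
}
```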
@@ -304,7 +305,7 @@ The RNG seed is used to initialize the random number generator that influences t
 
 ### Memory Float 32
 
--   `--memory_f32`: Use 32-bit floats instead of 16-bit floats for memory key+value, allowing higher quality inference at the cost of higher memory usage.
+-   `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. This doubles the context memory requirement but does not appear to increase generation quality in a measurable way. Not recommended.
 
 ## Limitations:
 
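The note above concerns the precision of the key+value cache. A minimal sketch of what the flag ultimately toggles, assuming the `f16_kv` field of `llama_context_params` from the llama.h of this period (the `memory_f16` side is visible in the server diff below):

```cpp
// Sketch only: --memory-f32 clears the f16 flag for the KV cache, roughly
// doubling the context memory used. f16_kv is assumed from llama.h of this era.
#include "llama.h"

void apply_kv_precision(llama_context_params & p, bool memory_f16) {
    p.f16_kv = memory_f16; // --memory-f32 sets memory_f16 = false -> 32-bit KV cache
}
```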
@@ -385,7 +385,9 @@ void server_print_usage(int /*argc*/, char **argv, const gpt_params &params)
   fprintf(stderr, "options:\n");
   fprintf(stderr, "  -h, --help            show this help message and exit\n");
   fprintf(stderr, "  -s SEED, --seed SEED  RNG seed (default: -1, use random seed for < 0)\n");
-  fprintf(stderr, "  --memory_f32          use f32 instead of f16 for memory key+value\n");
+  fprintf(stderr, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
+  fprintf(stderr, "  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
+  fprintf(stderr, "                        not recommended: doubles context memory required and no measurable increase in quality\n");
   fprintf(stderr, "  --embedding           enable embedding mode\n");
   fprintf(stderr, "  --keep                number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
   if (llama_mlock_supported())
@@ -396,8 +398,10 @@ void server_print_usage(int /*argc*/, char **argv, const gpt_params &params)
   {
     fprintf(stderr, "  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
   }
+#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
   fprintf(stderr, "  -ngl N, --n-gpu-layers N\n");
   fprintf(stderr, "                        number of layers to store in VRAM\n");
+#endif
   fprintf(stderr, "  -m FNAME, --model FNAME\n");
   fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
   fprintf(stderr, "  -a ALIAS, --alias ALIAS\n");
@@ -473,7 +477,7 @@ bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_params &params)
       server_print_usage(argc, argv, default_params);
       exit(0);
     }
-    else if (arg == "-c" || arg == "--ctx_size")
+    else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size")
     {
       if (++i >= argc)
       {
@@ -482,7 +486,7 @@ bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_params &params)
       }
       params.n_ctx = std::stoi(argv[i]);
     }
-    else if (arg == "--memory_f32")
+    else if (arg == "--memory-f32" || arg == "--memory_f32")
     {
       params.memory_f16 = false;
     }
@@ -493,7 +497,12 @@ bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_params &params)
         invalid_param = true;
         break;
       }
+#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
       params.n_gpu_layers = std::stoi(argv[i]);
+#else
+      fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
+      fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+#endif
     }
     else
     {