Commit in llama.cpp (mirror of https://github.com/ggml-org/llama.cpp.git), by Georgi Gerganov:

ci : update ".bin" to ".gguf" extension

ggml-ci
		
							
								
								
									
README.md (22 changed lines)
@@ -284,7 +284,7 @@ When built with Metal support, you can enable GPU inference with the `--gpu-laye
 Any value larger than 0 will offload the computation to the GPU. For example:

 ```bash
-./main -m ./models/7B/ggml-model-q4_0.bin -n 128 -ngl 1
+./main -m ./models/7B/ggml-model-q4_0.gguf -n 128 -ngl 1
 ```

 ### MPI Build
@@ -323,7 +323,7 @@ The above will distribute the computation across 2 processes on the first host a
 Finally, you're ready to run a computation using `mpirun`:

 ```bash
-mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.bin -n 128
+mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
 ```

 ### BLAS Build
@@ -506,10 +506,10 @@ python3 convert.py models/7B/
   python convert.py models/7B/ --vocabtype bpe

 # quantize the model to 4-bits (using q4_0 method)
-./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin q4_0
+./quantize ./models/7B/ggml-model-f16.gguf ./models/7B/ggml-model-q4_0.gguf q4_0

 # run the inference
-./main -m ./models/7B/ggml-model-q4_0.bin -n 128
+./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
 ```

 When running the larger models, make sure you have enough disk space to store all the intermediate files.
@@ -565,7 +565,7 @@ Here is an example of a few-shot interaction, invoked with the command
 ./examples/chat-13B.sh

 # custom arguments using a 13B model
-./main -m ./models/13B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
+./main -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
 ```

 Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `main` example program.
@@ -628,6 +628,8 @@ OpenLLaMA is an openly licensed reproduction of Meta's original LLaMA model. It

 ### Using [GPT4All](https://github.com/nomic-ai/gpt4all)

+*Note: these instructions are likely obsoleted by the GGUF update*
+
 - Obtain the `tokenizer.model` file from LLaMA model and put it to `models`
 - Obtain the `added_tokens.json` file from Alpaca model and put it to `models`
 - Obtain the `gpt4all-lora-quantized.bin` file from GPT4All model and put it to `models/gpt4all-7B`
@@ -703,7 +705,7 @@ If your issue is with model generation quality, then please at least scan the fo
 #### How to run

 1. Download/extract: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
-2. Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
+2. Run `./perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw`
 3. Output:
 ```
 perplexity : calculating perplexity over 655 chunks
@@ -802,13 +804,13 @@ docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-
 On completion, you are ready to play!

 ```bash
-docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
+docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
 ```

 or with a light image:

 ```bash
-docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
+docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
 ```

 ### Docker With CUDA
@@ -839,8 +841,8 @@ The resulting images, are essentially the same as the non-CUDA images:
 After building locally, Usage is similar to the non-CUDA examples, but you'll need to add the `--gpus` flag. You will also want to use the `--n-gpu-layers` flag.

 ```bash
-docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
-docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
+docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
+docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
 ```

 ### Contributing
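The README hunks above all touch the same workflow, so here is the post-change sequence collected in one place. This is a minimal sketch assembled from the commands shown in the diff; it assumes the `convert.py` script and the `quantize` and `main` binaries live in the repository root and that the original 7B weights are already under `models/7B/`.

```bash
# convert the original weights to a GGUF file in f16 precision
python3 convert.py models/7B/

# quantize the f16 GGUF to 4 bits using the q4_0 method
./quantize ./models/7B/ggml-model-f16.gguf ./models/7B/ggml-model-q4_0.gguf q4_0

# run inference; -ngl 1 offloads computation to the GPU on Metal/cuBLAS builds
./main -m ./models/7B/ggml-model-q4_0.gguf -n 128 -ngl 1
```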
							
								
								
									
ci/run.sh (44 changed lines)
@@ -159,17 +159,17 @@ function gg_run_open_llama_3b_v2 {

     python3 ../convert.py ${path_models}

-    model_f16="${path_models}/ggml-model-f16.bin"
-    model_q8_0="${path_models}/ggml-model-q8_0.bin"
-    model_q4_0="${path_models}/ggml-model-q4_0.bin"
-    model_q4_1="${path_models}/ggml-model-q4_1.bin"
-    model_q5_0="${path_models}/ggml-model-q5_0.bin"
-    model_q5_1="${path_models}/ggml-model-q5_1.bin"
-    model_q2_k="${path_models}/ggml-model-q2_k.bin"
-    model_q3_k="${path_models}/ggml-model-q3_k.bin"
-    model_q4_k="${path_models}/ggml-model-q4_k.bin"
-    model_q5_k="${path_models}/ggml-model-q5_k.bin"
-    model_q6_k="${path_models}/ggml-model-q6_k.bin"
+    model_f16="${path_models}/ggml-model-f16.gguf"
+    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
+    model_q4_0="${path_models}/ggml-model-q4_0.gguf"
+    model_q4_1="${path_models}/ggml-model-q4_1.gguf"
+    model_q5_0="${path_models}/ggml-model-q5_0.gguf"
+    model_q5_1="${path_models}/ggml-model-q5_1.gguf"
+    model_q2_k="${path_models}/ggml-model-q2_k.gguf"
+    model_q3_k="${path_models}/ggml-model-q3_k.gguf"
+    model_q4_k="${path_models}/ggml-model-q4_k.gguf"
+    model_q5_k="${path_models}/ggml-model-q5_k.gguf"
+    model_q6_k="${path_models}/ggml-model-q6_k.gguf"

     wiki_test_60="${path_wiki}/wiki.test-60.raw"

@@ -285,17 +285,17 @@ function gg_run_open_llama_7b_v2 {

     python3 ../convert.py ${path_models}

-    model_f16="${path_models}/ggml-model-f16.bin"
-    model_q8_0="${path_models}/ggml-model-q8_0.bin"
-    model_q4_0="${path_models}/ggml-model-q4_0.bin"
-    model_q4_1="${path_models}/ggml-model-q4_1.bin"
-    model_q5_0="${path_models}/ggml-model-q5_0.bin"
-    model_q5_1="${path_models}/ggml-model-q5_1.bin"
-    model_q2_k="${path_models}/ggml-model-q2_k.bin"
-    model_q3_k="${path_models}/ggml-model-q3_k.bin"
-    model_q4_k="${path_models}/ggml-model-q4_k.bin"
-    model_q5_k="${path_models}/ggml-model-q5_k.bin"
-    model_q6_k="${path_models}/ggml-model-q6_k.bin"
+    model_f16="${path_models}/ggml-model-f16.gguf"
+    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
+    model_q4_0="${path_models}/ggml-model-q4_0.gguf"
+    model_q4_1="${path_models}/ggml-model-q4_1.gguf"
+    model_q5_0="${path_models}/ggml-model-q5_0.gguf"
+    model_q5_1="${path_models}/ggml-model-q5_1.gguf"
+    model_q2_k="${path_models}/ggml-model-q2_k.gguf"
+    model_q3_k="${path_models}/ggml-model-q3_k.gguf"
+    model_q4_k="${path_models}/ggml-model-q4_k.gguf"
+    model_q5_k="${path_models}/ggml-model-q5_k.gguf"
+    model_q6_k="${path_models}/ggml-model-q6_k.gguf"

     wiki_test="${path_wiki}/wiki.test.raw"

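The variables above only name the expected GGUF files; the commands that actually produce them are outside this hunk. As a hedged illustration (not the literal `ci/run.sh` code), the quantized variants could be generated from the f16 conversion roughly like this, assuming a `quantize` binary in the working directory and a `${path_models}` directory that has already been populated by `convert.py`:

```bash
# illustrative only: derive each quantized GGUF from the f16 model
model_f16="${path_models}/ggml-model-f16.gguf"

for qtype in q8_0 q4_0 q4_1 q5_0 q5_1 q2_k q3_k q4_k q5_k q6_k; do
    ./quantize "${model_f16}" "${path_models}/ggml-model-${qtype}.gguf" "${qtype}"
done
```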
@@ -3,7 +3,7 @@
 ## Verifying that the model is running on the GPU with cuBLAS
 Make sure you compiled llama with the correct env variables according to [this guide](../README.md#cublas), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example:
 ```shell
-./main -m "path/to/model.bin" -ngl 200000 -p "Please sir, may I have some "
+./main -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some "
 ```

 When running llama, before it starts the inference work, it will output diagnostic information that shows whether cuBLAS is offloading work to the GPU. Look for these lines:
@@ -25,9 +25,9 @@ GPU: A6000 (48GB VRAM)
 CPU: 7 physical cores
 RAM: 32GB

-Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.ggmlv3.q4_0.bin` (30B parameters, 4bit quantization, GGML)
+Model: `TheBloke_Wizard-Vicuna-30B-Uncensored-GGML/Wizard-Vicuna-30B-Uncensored.q4_0.gguf` (30B parameters, 4bit quantization, GGML)

-Run command: `./main -m "path/to/model.bin" -p "-p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`
+Run command: `./main -m "path/to/model.gguf" -p "An extremely detailed description of the 10 best ethnic dishes will follow, with recipes: " -n 1000 [additional benchmark flags]`

 Result:

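One practical way to apply the advice above is to time the same prompt with and without GPU offloading and compare. A minimal sketch, assuming a cuBLAS-enabled build of `main` and an illustrative model path:

```bash
MODEL="path/to/model.gguf"   # illustrative path, substitute your own GGUF file
PROMPT="Please sir, may I have some "

# CPU-only baseline: no layers offloaded
time ./main -m "$MODEL" -ngl 0 -p "$PROMPT" -n 128

# offload as many layers as possible; an oversized -ngl is clamped to the
# model's actual layer count, as the text above explains
time ./main -m "$MODEL" -ngl 200000 -p "$PROMPT" -n 128
```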
@@ -52,7 +52,7 @@ struct gpt_params {
     std::string cfg_negative_prompt;       // string to help guidance
     float       cfg_scale         = 1.f;   // How strong is guidance

-    std::string model             = "models/7B/ggml-model.bin"; // model path
+    std::string model             = "models/7B/ggml-model-f16.bin"; // model path
     std::string model_alias       = "unknown"; // model alias
     std::string prompt            = "";
     std::string path_prompt_cache = "";  // path to file for saving/loading prompt eval state

@@ -2,7 +2,7 @@
 //
 // - First, export a LLaMA graph:
 //
-//  $ ./bin/main -m ../models/7B/ggml-model-q4_0.bin --export
+//  $ ./bin/main -m ../models/7B/ggml-model-q4_0.gguf --export
 //
 // - Run this tool to evaluate the exported graph:
 //

@@ -5,7 +5,7 @@ This example demonstrates a simple HTTP API server and a simple web front end to
 Command line options:

 -   `--threads N`, `-t N`: Set the number of threads to use during computation.
--   `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
+-   `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
 -   `-m ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
 -   `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096.
 -   `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
@@ -48,14 +48,12 @@ To get started right away, run the following command, making sure to use the cor
 ### Unix-based systems (Linux, macOS, etc.):

 ```bash
-./server -m models/7B/ggml-model.bin -c 2048
+./server -m models/7B/ggml-model.gguf -c 2048
 ```

 ### Windows:

 ```powershell
 server.exe -m models\7B\ggml-model.bin -c 2048
 ```

 The above command will start a server that by default listens on `127.0.0.1:8080`.
 You can consume the endpoints with Postman or NodeJS with axios library. You can visit the web front end at the same url.
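Once the server is started with one of the commands above, its HTTP endpoints can be exercised from the shell as well as from Postman or axios. A hedged example, assuming the `/completion` endpoint with a JSON body that the full server README documents (it is not part of this excerpt):

```bash
# start the server in the background, using the command from the diff above
./server -m models/7B/ggml-model.gguf -c 2048 &

# query the assumed /completion endpoint on the default 127.0.0.1:8080 address
curl --request POST \
     --url http://127.0.0.1:8080/completion \
     --header "Content-Type: application/json" \
     --data '{"prompt": "Building a website can be done in 10 simple steps:", "n_predict": 128}'
```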
@@ -3575,7 +3575,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             } else {
                 size_t counter = 0;
                 new_size = 0;
-                auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements] () {
+                auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size]() { // NOLINT
                     std::vector<int64_t> local_hist;
                     size_t local_size = 0;
                     while (true) {