mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-31 08:51:55 +00:00)

add gptneox gguf example
Author: klosax

This commit adds three new files: convert-gptneox-h5-to-gguf.py (173 lines),
gptneox-common.cpp (601 lines), and gptneox-common.h (125 lines).

convert-gptneox-h5-to-gguf.py  (new file)
@@ -0,0 +1,173 @@
# Quick and dirty HF gptneox -> gguf conversion
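# Example invocation (the model directory here is hypothetical):
#
#   python convert-gptneox-h5-to-gguf.py models/pythia-70b-deduped 1
#
# This would write models/pythia-70b-deduped/ggml-model-f16.gguf.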

import gguf
import sys
import json
import numpy as np
from typing import List
from pathlib import Path
from transformers import AutoModelForCausalLM


if len(sys.argv) < 3:
    print("Usage: convert-gptneox-h5-to-gguf.py dir-model ftype\n")
    print("  ftype == 0 -> float32")
    print("  ftype == 1 -> float16")
    sys.exit(1)


# output in the same directory as the model
dir_model = sys.argv[1]
fname_out = sys.argv[1] + "/ggml-model.bin"


# possible tensor data types
#   ftype == 0 -> float32
#   ftype == 1 -> float16
#
# map from ftype to string
ftype_str = ["f32", "f16"]

ftype = 1
if len(sys.argv) > 2:
    ftype = int(sys.argv[2])
    if ftype < 0 or ftype > 1:
        print("Invalid ftype: " + str(ftype))
        sys.exit(1)
    fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".gguf"


model = AutoModelForCausalLM.from_pretrained(dir_model, low_cpu_mem_usage=True, trust_remote_code=True)
list_vars = model.state_dict()

# count tensors to be converted
tensor_count = 0
for name in list_vars.keys():
    # we don't need these
    if name.endswith(".attention.masked_bias") or name.endswith(".attention.bias") or name.endswith(".attention.rotary_emb.inv_freq"):
        continue
    tensor_count += 1

gguf_writer = gguf.GGUFWriter.open(fname_out)

with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)

# This must be kept in sync with the number of key-value pairs written below
# (11 hparams + 3 tokenizer entries)
kv_count = 14

print("tensors " + str(tensor_count) + " kv " + str(kv_count))

print("write gguf header")

gguf_writer.write_header(tensor_count, kv_count)

print("write gguf hparams")

llm_arch = "gptneox"

gguf_writer.write_name("pythia-70b-deduped")
gguf_writer.write_description("gguf test model")
gguf_writer.write_architecture(llm_arch)
gguf_writer.write_context_length(llm_arch, hparams["max_position_embeddings"])
gguf_writer.write_embedding_length(llm_arch, hparams["hidden_size"])
gguf_writer.write_layer_count(llm_arch, hparams["num_hidden_layers"])
gguf_writer.write_feed_forward_length(llm_arch, hparams["intermediate_size"])
# rotary dims = rotary_pct * head_dim, where head_dim = hidden_size / num_attention_heads
gguf_writer.write_rope_dimension_count(llm_arch, int(hparams["rotary_pct"] * (hparams["hidden_size"] // hparams["num_attention_heads"])))
gguf_writer.write_head_count(llm_arch, hparams["num_attention_heads"])
gguf_writer.write_parallel_residual(llm_arch, hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
gguf_writer.write_layer_norm_eps(llm_arch, hparams["layer_norm_eps"])

# TOKENIZATION

print("write gguf tokenizer")

tokens: List[str] = []
merges: List[str] = []

if Path(dir_model + "/tokenizer.json").is_file():
    # vocab type gpt2
    print("Adding gpt2 tokenizer vocab")

    with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
        tokenizer = json.load(f)

    # note: this relies on the JSON object listing tokens in token-id order,
    # which HF tokenizer.json files generally do (json.load preserves key order)
    for key in tokenizer["model"]["vocab"]:
        tokens.append(key)

    merges = tokenizer["model"]["merges"]

gguf_writer.write_tokenizer_model("gpt2")
gguf_writer.write_token_list(tokens)
gguf_writer.write_token_merges(merges)

# TENSORS

# tensor info
print("write gguf tensor info")

for name in list_vars.keys():
    # we don't need these
    if name.endswith(".attention.masked_bias") or name.endswith(".attention.bias") or name.endswith(".attention.rotary_emb.inv_freq"):
        continue

    data = list_vars[name].squeeze().numpy()

    n_dims = len(data.shape)

    # ftype == 0 -> float32, ftype == 1 -> float16
    ftype_cur = 0
    if ftype != 0:
        if name.endswith(".weight") and n_dims == 2:
            data = data.astype(np.float16)
            ftype_cur = 1
        else:
            data = data.astype(np.float32)
            ftype_cur = 0
    else:
        if data.dtype != np.float32:
            data = data.astype(np.float32)
            ftype_cur = 0

    gguf_writer.write_tensor_info(name, data)


# tensor data
print("write gguf tensor data")

for name in list_vars.keys():
    data = list_vars[name].squeeze().numpy()
    print("Process tensor: " + name + " with shape: ", data.shape)

    # we don't need these
    if name.endswith(".attention.masked_bias") or name.endswith(".attention.bias") or name.endswith(".attention.rotary_emb.inv_freq"):
        print("  Skip tensor: " + name)
        continue

    n_dims = len(data.shape)

    # ftype == 0 -> float32, ftype == 1 -> float16
    ftype_cur = 0
    if ftype != 0:
        if name.endswith(".weight") and n_dims == 2:
            print("  Converting to float16")
            data = data.astype(np.float16)
            ftype_cur = 1
        else:
            print("  Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0
    else:
        if data.dtype != np.float32:
            print("  Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0

    gguf_writer.write_tensor(data)

gguf_writer.close()


print("Done. Output file: " + fname_out)
print("")
gptneox-common.cpp  (new file)
@@ -0,0 +1,601 @@
#include "gptneox-common.h"

#include <algorithm> // for std::min, std::find, std::partial_sort, std::copy
#include <cmath>
#include <cstring>
#include <fstream>
#include <regex>
#include <locale>
#include <codecvt>
#include <sstream>

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

// Return the next argument for a flag, or exit with an error if it is missing
std::string get_next_arg(int& i, int argc, char** argv, const std::string& flag, gpt_params& params) {
    if (i + 1 < argc && argv[i + 1][0] != '-') {
        return argv[++i];
    } else {
        fprintf(stderr, "error: %s requires one argument.\n", flag.c_str());
        gpt_print_usage(argc, argv, params);
        exit(1);
    }
}

bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
    for (int i = 1; i < argc; i++) {
        std::string arg = argv[i];

        if (arg == "-s" || arg == "--seed") {
            params.seed = std::stoi(get_next_arg(i, argc, argv, arg, params));
        } else if (arg == "-t" || arg == "--threads") {
            params.n_threads = std::stoi(get_next_arg(i, argc, argv, arg, params));
        } else if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") {
            params.n_gpu_layers = std::stoi(get_next_arg(i, argc, argv, arg, params));
        } else if (arg == "-p" || arg == "--prompt") {
            params.prompt = get_next_arg(i, argc, argv, arg, params);
        } else if (arg == "-n" || arg == "--n_predict") {
            params.n_predict = std::stoi(get_next_arg(i, argc, argv, arg, params));
        } else if (arg == "--top_k") {
            params.top_k = std::stoi(get_next_arg(i, argc, argv, arg, params));
        } else if (arg == "--top_p") {
            params.top_p = std::stof(get_next_arg(i, argc, argv, arg, params));
        } else if (arg == "--temp") {
            params.temp = std::stof(get_next_arg(i, argc, argv, arg, params));
        } else if (arg == "--repeat-last-n") {
            params.repeat_last_n = std::stoi(get_next_arg(i, argc, argv, arg, params));
        } else if (arg == "--repeat-penalty") {
            params.repeat_penalty = std::stof(get_next_arg(i, argc, argv, arg, params));
        } else if (arg == "-b" || arg == "--batch_size") {
            params.n_batch = std::stoi(get_next_arg(i, argc, argv, arg, params));
        } else if (arg == "-m" || arg == "--model") {
            params.model = get_next_arg(i, argc, argv, arg, params);
        } else if (arg == "-i" || arg == "--interactive") {
            params.interactive = true;
        } else if (arg == "-ip" || arg == "--interactive-port") {
            params.interactive = true;
            params.interactive_port = std::stoi(get_next_arg(i, argc, argv, arg, params));
        } else if (arg == "-h" || arg == "--help") {
            gpt_print_usage(argc, argv, params);
            exit(0);
        } else if (arg == "-f" || arg == "--file") {
            get_next_arg(i, argc, argv, arg, params);
            std::ifstream file(argv[i]);
            if (!file) {
                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
                break;
            }
            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
            if (!params.prompt.empty() && params.prompt.back() == '\n') {
                params.prompt.pop_back();
            }
        } else if (arg == "-tt" || arg == "--token_test") {
            params.token_test = get_next_arg(i, argc, argv, arg, params);
        } else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            gpt_print_usage(argc, argv, params);
            exit(1);
        }
    }

    return true;
}
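
// A minimal usage sketch (hypothetical caller, not part of this file): an
// example's main() would typically drive the parser like
//
//   gpt_params params;
//   if (!gpt_params_parse(argc, argv, params)) {
//       return 1;
//   }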

void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    fprintf(stderr, "usage: %s [options]\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "options:\n");
    fprintf(stderr, "  -h, --help            show this help message and exit\n");
    fprintf(stderr, "  -s SEED, --seed SEED  RNG seed (default: -1)\n");
    fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
    fprintf(stderr, "  -ngl N, --gpu-layers N  number of layers to offload to GPU on supported models (default: %d)\n", params.n_gpu_layers);
    fprintf(stderr, "  -p PROMPT, --prompt PROMPT\n");
    fprintf(stderr, "                        prompt to start generation with (default: random)\n");
    fprintf(stderr, "  -f FNAME, --file FNAME\n");
    fprintf(stderr, "                        load prompt from a file\n");
    fprintf(stderr, "  -tt TOKEN_TEST, --token_test TOKEN_TEST\n");
    fprintf(stderr, "                        test tokenization\n");
    fprintf(stderr, "  -n N, --n_predict N   number of tokens to predict (default: %d)\n", params.n_predict);
    fprintf(stderr, "  --top_k N             top-k sampling (default: %d)\n", params.top_k);
    fprintf(stderr, "  --top_p N             top-p sampling (default: %.1f)\n", params.top_p);
    fprintf(stderr, "  --temp N              temperature (default: %.1f)\n", params.temp);
    fprintf(stderr, "  --repeat-last-n N     last n tokens to consider for the repeat penalty (default: %d, 0 = disabled)\n", params.repeat_last_n);
    fprintf(stderr, "  --repeat-penalty N    penalize repeated token sequences (default: %.2f, 1.0 = disabled)\n", (double)params.repeat_penalty);
    fprintf(stderr, "  -b N, --batch_size N  batch size for prompt processing (default: %d)\n", params.n_batch);
    fprintf(stderr, "  -m FNAME, --model FNAME\n");
    fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
    fprintf(stderr, "\n");
}

std::string gpt_random_prompt(std::mt19937 & rng) {
    const int r = rng() % 10;
    switch (r) {
        case 0: return "So";
        case 1: return "Once upon a time";
        case 2: return "When";
        case 3: return "The";
        case 4: return "After";
        case 5: return "If";
        case 6: return "import";
        case 7: return "He";
        case 8: return "She";
        case 9: return "They";
        default: return "To";
    }
}

std::string trim(const std::string & s) {
    std::regex e("^\\s+|\\s+$");
    return std::regex_replace(s, e, "");
}

std::string replace(const std::string & s, const std::string & from, const std::string & to) {
    std::string result = s;
    size_t pos = 0;
    while ((pos = result.find(from, pos)) != std::string::npos) {
        result.replace(pos, from.length(), to);
        pos += to.length();
    }
    return result;
}

void gpt_vocab::add_special_token(const std::string & token) {
    special_tokens.push_back(token);
}

std::map<std::string, int32_t> json_parse(const std::string & fname) {
    std::map<std::string, int32_t> result;

    // read file into string
    std::string json;
    {
        std::ifstream ifs(fname);
        if (!ifs) {
            fprintf(stderr, "Failed to open %s\n", fname.c_str());
            exit(1);
        }

        json = std::string((std::istreambuf_iterator<char>(ifs)),
                (std::istreambuf_iterator<char>()));
    }

    if (json[0] != '{') {
        return result;
    }

    // parse json
    {
        bool has_key  = false;
        bool in_token = false;

        std::string str_key = "";
        std::string str_val = "";

        int n = json.size();
        for (int i = 1; i < n; ++i) {
            if (!in_token) {
                if (json[i] == ' ') continue;
                if (json[i] == '"') {
                    in_token = true;
                    continue;
                }
            } else {
                if (json[i] == '\\' && i+1 < n) {
                    if (has_key == false) {
                        str_key += json[i];
                    } else {
                        str_val += json[i];
                    }
                    ++i;
                } else if (json[i] == '"') {
                    if (has_key == false) {
                        has_key = true;
                        ++i;
                        while (json[i] == ' ') ++i;
                        ++i; // :
                        while (json[i] == ' ') ++i;
                        if (json[i] != '\"') {
                            while (json[i] != ',' && json[i] != '}') {
                                str_val += json[i++];
                            }
                            has_key = false;
                        } else {
                            in_token = true;
                            continue;
                        }
                    } else {
                        has_key = false;
                    }

                    str_key = ::replace(str_key, "\\u0120", " " ); // \u0120 -> space
                    str_key = ::replace(str_key, "\\u010a", "\n"); // \u010a -> new line
                    str_key = ::replace(str_key, "\\\"",    "\""); // \\\"   -> "

                    try {
                        result[str_key] = std::stoi(str_val);
                    } catch (...) {
                        //fprintf(stderr, "%s: ignoring key '%s' with value '%s'\n", fname.c_str(), str_key.c_str(), str_val.c_str());
                    }
                    str_key = "";
                    str_val = "";
                    in_token = false;
                    continue;
                }
                if (has_key == false) {
                    str_key += json[i];
                } else {
                    str_val += json[i];
                }
            }
        }
    }

    return result;
}

std::string convert_to_utf8(const std::wstring & input) {
    std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
    return converter.to_bytes(input);
}

std::wstring convert_to_wstring(const std::string & input) {
    std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
    return converter.from_bytes(input);
}

void gpt_split_words(std::string str, std::vector<std::string>& words) {
    const std::string pattern = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
    const std::regex re(pattern);
    std::smatch m;

    while (std::regex_search(str, m, re)) {
        for (auto x : m) {
            words.push_back(x);
        }
        str = m.suffix();
    }
}

std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
    std::vector<std::string> words;

    // first split the text into words
    {
        std::string str = text;

        // Generate the subpattern from the special_tokens vector if it's not empty
        if (!vocab.special_tokens.empty()) {
            const std::regex escape(R"([\[\\\^\$\.\|\?\*\+\(\)\{\}])");
            std::string special_tokens_subpattern;
            for (const auto & token : vocab.special_tokens) {
                if (!special_tokens_subpattern.empty()) {
                    special_tokens_subpattern += "|";
                }
                special_tokens_subpattern += std::regex_replace(token, escape, R"(\$&)");
            }

            std::regex re(special_tokens_subpattern);
            std::smatch m;
            // Split the text by special tokens.
            while (std::regex_search(str, m, re)) {
                // Split the substrings in-between special tokens into words.
                gpt_split_words(m.prefix(), words);
                // Add matched special tokens as words.
                for (auto x : m) {
                    words.push_back(x);
                }
                str = m.suffix();
            }
            // Remaining text without special tokens will be handled below.
        }

        gpt_split_words(str, words);
    }

    // find the longest token that forms each word in words:
    std::vector<gpt_vocab::id> tokens;
    for (const auto & word : words) {
        for (int i = 0; i < (int) word.size(); ){
            for (int j = word.size() - 1; j >= i; j--){
                auto cand = word.substr(i, j-i+1);
                auto it = vocab.token_to_id.find(cand);
                if (it != vocab.token_to_id.end()){ // word.substr(i, j-i+1) in vocab
                    tokens.push_back(it->second);
                    i = j + 1;
                    break;
                }
                else if (j == i){ // word.substr(i, 1) has no matching
                    fprintf(stderr, "%s: unknown token '%s'\n", __func__, word.substr(i, 1).data());
                    i++;
                }
            }
        }
    }

    return tokens;
}
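
// A minimal usage sketch (the vocab path below is hypothetical):
//
//   gpt_vocab vocab;
//   gpt_vocab_init("models/pythia-70b-deduped/encoder.json", vocab);
//   std::vector<gpt_vocab::id> embd_inp = gpt_tokenize(vocab, "Once upon a time");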

std::vector<gpt_vocab::id> parse_tokens_from_string(const std::string& input, char delimiter) {
    std::vector<gpt_vocab::id> output;
    std::stringstream ss(input);
    std::string token;

    while (std::getline(ss, token, delimiter)) {
        output.push_back(std::stoi(token));
    }

    return output;
}

std::map<std::string, std::vector<gpt_vocab::id>> extract_tests_from_file(const std::string & fpath_test){
    if (fpath_test.empty()){
        fprintf(stderr, "%s : No test file found.\n", __func__);
        return std::map<std::string, std::vector<gpt_vocab::id>>();
    }

    std::map<std::string, std::vector<gpt_vocab::id>> tests;

    auto fin = std::ifstream(fpath_test, std::ios_base::in);
    const char * delimiter = " => ";
    const char del_tok = ',';
    std::string line;
    while (std::getline(fin, line)) {
        size_t delimiterPos = line.find(delimiter);
        if (delimiterPos != std::string::npos) {
            std::string text = line.substr(0, delimiterPos);
            std::string s_tokens = line.substr(delimiterPos + std::strlen(delimiter));
            tests[text] = parse_tokens_from_string(s_tokens, del_tok);
        }
    }
    return tests;
}
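
// The test file is expected to hold one case per line in the form
// "text => comma-separated token ids", e.g. (ids made up for illustration):
//
//   Hello world => 12092,1533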

void test_gpt_tokenizer(gpt_vocab & vocab, const std::string & fpath_test){
    std::map<std::string, std::vector<gpt_vocab::id>> tests = extract_tests_from_file(fpath_test);

    size_t n_fails = 0;

    for (const auto & test : tests) {
        std::vector<gpt_vocab::id> tokens = gpt_tokenize(vocab, test.first);

        if (tokens != test.second){
            n_fails++;

            // print out failure cases
            fprintf(stderr, "%s : failed test: '%s'\n", __func__, test.first.c_str());
            fprintf(stderr, "%s : tokens in hf:   ", __func__);
            for (const auto & t : test.second) {
                fprintf(stderr, "%s(%d), ", vocab.id_to_token[t].c_str(), t);
            }
            fprintf(stderr, "\n");
            fprintf(stderr, "%s : tokens in ggml: ", __func__);
            for (const auto & t : tokens) {
                fprintf(stderr, "%s(%d), ", vocab.id_to_token[t].c_str(), t);
            }
            fprintf(stderr, "\n");
        }
    }

    fprintf(stderr, "%s : %zu tests failed out of %zu tests.\n", __func__, n_fails, tests.size());
}

bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {
    printf("%s: loading vocab from '%s'\n", __func__, fname.c_str());

    vocab.token_to_id = ::json_parse(fname);

    for (const auto & kv : vocab.token_to_id) {
        vocab.id_to_token[kv.second] = kv.first;
    }

    printf("%s: vocab size = %d\n", __func__, (int) vocab.token_to_id.size());

    // print the vocabulary
    //for (auto kv : vocab.token_to_id) {
    //    printf("'%s' -> %d\n", kv.first.data(), kv.second);
    //}

    return true;
}

gpt_vocab::id gpt_sample_top_k_top_p(
        const gpt_vocab & vocab,
        const float * logits,
        int    top_k,
        double top_p,
        double temp,
        std::mt19937 & rng) {
    int n_logits = vocab.id_to_token.size();

    // clamp top_k so the partial sort below stays in range
    top_k = std::min(top_k, n_logits);

    std::vector<std::pair<double, gpt_vocab::id>> logits_id;
    logits_id.reserve(n_logits);

    {
        const double scale = 1.0/temp;
        for (int i = 0; i < n_logits; ++i) {
            logits_id.push_back(std::make_pair(logits[i]*scale, i));
        }
    }

    // find the top K tokens
    std::partial_sort(
            logits_id.begin(),
            logits_id.begin() + top_k, logits_id.end(),
            [](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
        return a.first > b.first;
    });

    logits_id.resize(top_k);

    double maxl = -INFINITY;
    for (const auto & kv : logits_id) {
        maxl = std::max(maxl, kv.first);
    }

    // compute probs for the top K tokens
    std::vector<double> probs;
    probs.reserve(logits_id.size());

    double sum = 0.0;
    for (const auto & kv : logits_id) {
        double p = exp(kv.first - maxl);
        probs.push_back(p);
        sum += p;
    }

    // normalize the probs
    for (auto & p : probs) {
        p /= sum;
    }

    if (top_p < 1.0f) {
        double cumsum = 0.0;
        for (int i = 0; i < top_k; i++) {
            cumsum += probs[i];
            if (cumsum >= top_p) {
                top_k = i + 1;
                probs.resize(top_k);
                logits_id.resize(top_k);
                break;
            }
        }

        cumsum = 1.0/cumsum;
        for (int i = 0; i < (int) probs.size(); i++) {
            probs[i] *= cumsum;
        }
    }

    //printf("\n");
    //for (int i = 0; i < (int) probs.size(); i++) {
    //    printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);
    //}
    //exit(0);

    std::discrete_distribution<> dist(probs.begin(), probs.end());
    int idx = dist(rng);

    return logits_id[idx].second;
}

gpt_vocab::id gpt_sample_top_k_top_p_repeat(
        const gpt_vocab & vocab,
        const float * logits,
        const int32_t * last_n_tokens_data,
        size_t last_n_tokens_data_size,
        int    top_k,
        double top_p,
        double temp,
        int repeat_last_n,
        float repeat_penalty,
        std::mt19937 & rng) {

    int n_logits = vocab.id_to_token.size();

    const auto * plogits = logits;

    const auto last_n_tokens = std::vector<int32_t>(last_n_tokens_data, last_n_tokens_data + last_n_tokens_data_size);

    if (temp <= 0) {
        // select the token with the highest logit directly
        float max_logit = plogits[0];
        gpt_vocab::id max_id = 0;

        for (int i = 1; i < n_logits; ++i) {
            if (plogits[i] > max_logit) {
                max_logit = plogits[i];
                max_id = i;
            }
        }
        return max_id;
    }

    // clamp so the partial sort and the penalty window below stay in range
    top_k = std::min(top_k, n_logits);
    repeat_last_n = std::min(repeat_last_n, (int) last_n_tokens.size());

    std::vector<std::pair<double, gpt_vocab::id>> logits_id;
    logits_id.reserve(n_logits);

    {
        const float scale = 1.0f/temp;
        for (int i = 0; i < n_logits; ++i) {
            // repetition penalty from ctrl paper (https://arxiv.org/abs/1909.05858)
            // credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main
            if (repeat_last_n > 0 && std::find(last_n_tokens.end()-repeat_last_n, last_n_tokens.end(), i) != last_n_tokens.end()) {
                // if score < 0 then repetition penalty has to be multiplied to reduce the previous token probability
                if (plogits[i] < 0.0f) {
                    logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i));
                } else {
                    logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i));
                }
            } else {
                logits_id.push_back(std::make_pair(plogits[i]*scale, i));
            }
        }
    }

    // find the top K tokens
    std::partial_sort(
            logits_id.begin(),
            logits_id.begin() + top_k, logits_id.end(),
            [](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
        return a.first > b.first;
    });

    logits_id.resize(top_k);

    double maxl = -INFINITY;
    for (const auto & kv : logits_id) {
        maxl = std::max(maxl, kv.first);
    }

    // compute probs for the top K tokens
    std::vector<double> probs;
    probs.reserve(logits_id.size());

    double sum = 0.0;
    for (const auto & kv : logits_id) {
        double p = exp(kv.first - maxl);
        probs.push_back(p);
        sum += p;
    }

    // normalize the probs
    for (auto & p : probs) {
        p /= sum;
    }

    if (top_p < 1.0f) {
        double cumsum = 0.0;
        for (int i = 0; i < top_k; i++) {
            cumsum += probs[i];
            if (cumsum >= top_p) {
                top_k = i + 1;
                probs.resize(top_k);
                logits_id.resize(top_k);
                break;
            }
        }

        cumsum = 1.0/cumsum;
        for (int i = 0; i < (int) probs.size(); i++) {
            probs[i] *= cumsum;
        }
    }

    //printf("\n");
    //for (int i = 0; i < (int) probs.size(); i++) {
    //    printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);
    //}

    std::discrete_distribution<> dist(probs.begin(), probs.end());
    int idx = dist(rng);

    return logits_id[idx].second;
}
gptneox-common.h  (new file)
@@ -0,0 +1,125 @@
// Various helper functions and utilities

#pragma once

#include <string>
#include <map>
#include <vector>
#include <random>
#include <thread>
#include <algorithm> // for std::min

//
// CLI argument parsing
//

struct gpt_params {
    int32_t seed      = -1;  // RNG seed
    int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
    int32_t n_predict = 200; // new tokens to predict
    int32_t n_batch   = 8;   // batch size for prompt processing

    // sampling parameters
    int32_t top_k          = 40;
    float   top_p          = 0.9f;
    float   temp           = 0.9f;
    int32_t repeat_last_n  = 64;
    float   repeat_penalty = 1.00f;

    std::string model      = "models/gpt-2-117M/ggml-model.bin"; // model path
    std::string prompt     = "";
    std::string token_test = "";

    bool    interactive      = false;
    int32_t interactive_port = -1;

    int32_t n_gpu_layers     = 0;
};

bool gpt_params_parse(int argc, char ** argv, gpt_params & params);

void gpt_print_usage(int argc, char ** argv, const gpt_params & params);

std::string gpt_random_prompt(std::mt19937 & rng);

//
// Vocab utils
//

std::string trim(const std::string & s);

std::string replace(
        const std::string & s,
        const std::string & from,
        const std::string & to);

struct gpt_vocab {
    using id    = int32_t;
    using token = std::string;

    std::map<token, id> token_to_id;
    std::map<id, token> id_to_token;
    std::vector<std::string> special_tokens;

    void add_special_token(const std::string & token);
};

// poor-man's JSON parsing
std::map<std::string, int32_t> json_parse(const std::string & fname);

std::string convert_to_utf8(const std::wstring & input);

std::wstring convert_to_wstring(const std::string & input);

void gpt_split_words(std::string str, std::vector<std::string>& words);

// split text into tokens
//
// ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
//
// Regex (Python):
// r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
//
// Regex (C++):
// R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"
//
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text);
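
// For example, the C++ pattern splits "Hello world's test" into
// "Hello", " world", "'s", " test" (a hand-worked illustration, not program output).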

// test outputs of gpt_tokenize
//
//   - compare with tokens generated by the huggingface tokenizer
//   - test cases are chosen based on the model's main language (under 'prompt' directory)
//   - prints each failing sentence together with the huggingface and ggml tokens,
//     followed by a summary of how many tests failed
//
void test_gpt_tokenizer(gpt_vocab & vocab, const std::string & fpath_test);

// load the tokens from encoder.json
bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);

// sample the next token given the logits for each vocabulary token
//
//   - scale the logits by 1/temp
//   - consider only the top K tokens
//   - from them, consider only the top tokens with cumulative probability > P
//
// TODO: not sure if this implementation is correct
//
gpt_vocab::id gpt_sample_top_k_top_p(
        const gpt_vocab & vocab,
        const float * logits,
        int    top_k,
        double top_p,
        double temp,
        std::mt19937 & rng);

gpt_vocab::id gpt_sample_top_k_top_p_repeat(
        const gpt_vocab & vocab,
        const float * logits,
        const int32_t * last_n_tokens_data,
        size_t last_n_tokens_data_size,
        int    top_k,
        double top_p,
        double temp,
        int repeat_last_n,
        float repeat_penalty,
        std::mt19937 & rng);
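
As a quick sanity check of the sampling API, a minimal standalone program can call
gpt_sample_top_k_top_p with a toy three-token vocabulary and hand-written logits.
This is a sketch for illustration only (the vocabulary, logits, and seed are made up;
it is not part of the commit) and would be compiled together with gptneox-common.cpp:

#include "gptneox-common.h"

#include <cstdio>
#include <random>
#include <vector>

int main() {
    // toy vocabulary: three tokens with ids 0..2
    gpt_vocab vocab;
    vocab.id_to_token = {{0, "a"}, {1, "b"}, {2, "c"}};
    for (const auto & kv : vocab.id_to_token) {
        vocab.token_to_id[kv.second] = kv.first;
    }

    // hand-written logits, one per token; "b" is the most likely
    std::vector<float> logits = {0.1f, 2.0f, 0.5f};

    std::mt19937 rng(42);
    const gpt_vocab::id id = gpt_sample_top_k_top_p(vocab, logits.data(),
            /*top_k=*/2, /*top_p=*/0.9, /*temp=*/0.9, rng);

    printf("sampled token: '%s' (id %d)\n", vocab.id_to_token[id].c_str(), id);
    return 0;
}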