Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-29 08:41:22 +00:00)

llama : add llama_vocab, functions -> methods, naming (#11110)

* llama : functions -> methods (#11110)
* llama : add struct llama_vocab to the API (#11156) ggml-ci
* hparams : move vocab params to llama_vocab (#11159) ggml-ci
* vocab : more pimpl (#11165) ggml-ci
* vocab : minor tokenization optimizations (#11160) ggml-ci
  Co-authored-by: Diego Devesa <slarengh@gmail.com>
* lora : update API names (#11167) ggml-ci
* llama : update API names to use correct prefix (#11174)
* llama : update API names to use correct prefix ggml-ci
* cont ggml-ci
* cont ggml-ci
* minor [no ci]
* vocab : llama_vocab_add_[be]os -> llama_vocab_get_add_[be]os (#11174) ggml-ci
* vocab : llama_vocab_n_vocab -> llama_vocab_n_tokens (#11174) ggml-ci

---------

Co-authored-by: Diego Devesa <slarengh@gmail.com>
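In practice, the rename means that everything token-related is now asked of a `llama_vocab` obtained from the model, while model-level properties keep the `llama_model_` prefix. A minimal sketch of the new call pattern, assuming a `llama_model *` that has already been loaded (the helper below is illustrative and not part of the commit):

```cpp
#include "llama.h"

#include <cstdio>

// Illustrative helper: query vocab properties through the new llama_vocab API.
// The pre-commit equivalents are noted in the comments.
static void print_vocab_info(const llama_model * model) {
    const llama_vocab * vocab = llama_model_get_vocab(model);

    const int32_t     n_tokens = llama_vocab_n_tokens(vocab);    // was llama_n_vocab(model)
    const llama_token bos      = llama_vocab_bos(vocab);         // was llama_token_bos(model)
    const llama_token eos      = llama_vocab_eos(vocab);         // was llama_token_eos(model)
    const bool        add_bos  = llama_vocab_get_add_bos(vocab); // was llama_add_bos_token(model)

    printf("n_tokens = %d, bos = %d, eos = %d, add_bos = %d\n",
           n_tokens, (int) bos, (int) eos, add_bos ? 1 : 0);
}
```

The diff below applies exactly this pattern to the server example: the vocab is looked up once next to the model and context, and the tokenization helpers take it directly.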
@@ -98,7 +98,7 @@ struct slot_params {
     int64_t t_max_prompt_ms  = -1; // TODO: implement
     int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit

-    std::vector<common_lora_adapter_info> lora;
+    std::vector<common_adapter_lora_info> lora;

     std::vector<std::string> antiprompt;
     std::vector<std::string> response_fields;
@@ -198,15 +198,17 @@ struct server_task {
     bool metrics_reset_bucket = false;

     // used by SERVER_TASK_TYPE_SET_LORA
-    std::vector<common_lora_adapter_info> set_lora;
+    std::vector<common_adapter_lora_info> set_lora;

     server_task(server_task_type type) : type(type) {}

     static slot_params params_from_json_cmpl(
-            const llama_model * model,
             const llama_context * ctx,
             const common_params & params_base,
             const json & data) {
+        const llama_model * model = llama_get_model(ctx);
+        const llama_vocab * vocab = llama_model_get_vocab(model);
+
         slot_params params;

         // Sampling parameter defaults are loaded from the global server context (but individual requests can still override them)
@@ -329,7 +331,7 @@ struct server_task {

             const auto & logit_bias = data.find("logit_bias");
             if (logit_bias != data.end() && logit_bias->is_array()) {
-                const int n_vocab = llama_n_vocab(model);
+                const int n_vocab = llama_vocab_n_tokens(vocab);
                 for (const auto & el : *logit_bias) {
                     // TODO: we may want to throw errors here, in case "el" is incorrect
                     if (el.is_array() && el.size() == 2) {
@@ -348,7 +350,7 @@ struct server_task {
                                 params.sampling.logit_bias.push_back({tok, bias});
                             }
                         } else if (el[0].is_string()) {
-                            auto toks = common_tokenize(model, el[0].get<std::string>(), false);
+                            auto toks = common_tokenize(vocab, el[0].get<std::string>(), false);
                             for (auto tok : toks) {
                                 params.sampling.logit_bias.push_back({tok, bias});
                             }
@@ -1131,7 +1133,7 @@ struct server_slot {

     common_speculative * spec = nullptr;

-    std::vector<common_lora_adapter_info> lora;
+    std::vector<common_adapter_lora_info> lora;

     // the index relative to completion multi-task request
     size_t index = 0;
@@ -1633,6 +1635,8 @@ struct server_context {
     llama_model * model = nullptr;
     llama_context * ctx = nullptr;

+    const llama_vocab * vocab = nullptr;
+
     llama_model * model_dft = nullptr;

     llama_context_params cparams_dft;
@@ -1690,10 +1694,12 @@ struct server_context {
             return false;
         }

+        vocab = llama_model_get_vocab(model);
+
         n_ctx = llama_n_ctx(ctx);

-        add_bos_token = llama_add_bos_token(model);
-        has_eos_token = llama_token_eos(model) != LLAMA_TOKEN_NULL;
+        add_bos_token = llama_vocab_get_add_bos(vocab);
+        has_eos_token = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;

         if (!params_base.speculative.model.empty()) {
             SRV_INF("loading draft model '%s'\n", params_base.speculative.model.c_str());
@@ -1736,7 +1742,8 @@ struct server_context {

     bool validate_builtin_chat_template() const {
         llama_chat_message chat[] = {{"user", "test"}};
-        int32_t chat_res = llama_chat_apply_template(model, nullptr, chat, 1, true, nullptr, 0);
+        const char * tmpl = llama_model_chat_template(model);
+        const int32_t chat_res = llama_chat_apply_template(tmpl, chat, 1, true, nullptr, 0);
         return chat_res > 0;
     }

@@ -1756,7 +1763,7 @@ struct server_context {
             if (model_dft) {
                 slot.batch_spec = llama_batch_init(params_base.speculative.n_max + 1, 0, 1);

-                slot.ctx_dft = llama_new_context_with_model(model_dft, cparams_dft);
+                slot.ctx_dft = llama_init_from_model(model_dft, cparams_dft);
                 if (slot.ctx_dft == nullptr) {
                     SRV_ERR("%s", "failed to create draft context\n");
                     return;
@@ -1891,7 +1898,7 @@ struct server_context {
         }

         if (slot.params.ignore_eos && has_eos_token) {
-            slot.params.sampling.logit_bias.push_back({llama_token_eos(model), -INFINITY});
+            slot.params.sampling.logit_bias.push_back({llama_vocab_eos(vocab), -INFINITY});
         }

         {
@@ -2047,14 +2054,14 @@ struct server_context {
                     slot.n_decoded, slot.n_prompt_tokens, slot.n_past, slot.n_ctx);
         }

-        if (llama_token_is_eog(model, result.tok)) {
+        if (llama_vocab_is_eog(vocab, result.tok)) {
             slot.stop           = STOP_TYPE_EOS;
             slot.has_next_token = false;

             SLT_DBG(slot, "%s", "stopped by EOS\n");
         }

-        const auto n_ctx_train = llama_n_ctx_train(model);
+        const auto n_ctx_train = llama_model_n_ctx_train(model);

         if (slot.params.n_predict < 1 && slot.n_predict < 1 && slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) {
             slot.truncated      = true;
@@ -2074,7 +2081,7 @@ struct server_context {

     void populate_token_probs(const server_slot & slot, completion_token_output & result, bool post_sampling, bool special, int idx) {
         size_t n_probs = slot.params.sampling.n_probs;
-        size_t n_vocab = llama_n_vocab(llama_get_model(ctx));
+        size_t n_vocab = llama_vocab_n_tokens(vocab);
         if (post_sampling) {
             const auto * cur_p = common_sampler_get_candidates(slot.smpl);
             const size_t max_probs = cur_p->size;
@@ -2225,7 +2232,7 @@ struct server_context {
         res->n_tokens  = slot.n_prompt_tokens;
         res->oaicompat = slot.params.oaicompat;

-        const int n_embd = llama_n_embd(model);
+        const int n_embd = llama_model_n_embd(model);

         std::vector<float> embd_res(n_embd, 0.0f);

@@ -2927,7 +2934,7 @@ struct server_context {
            // make sure we're in the right embedding mode
            llama_set_embeddings(ctx, slot_batched->is_non_causal());
            // apply lora, only need to do it once per batch
-            common_lora_adapters_apply(ctx, slot_batched->lora);
+            common_set_adapter_lora(ctx, slot_batched->lora);
         }

         // process the created batch of tokens
@@ -3129,12 +3136,12 @@ struct server_context {

     json model_meta() const {
         return json {
-            {"vocab_type",  llama_vocab_type    (model)},
-            {"n_vocab",     llama_n_vocab       (model)},
-            {"n_ctx_train", llama_n_ctx_train   (model)},
-            {"n_embd",      llama_n_embd        (model)},
-            {"n_params",    llama_model_n_params(model)},
-            {"size",        llama_model_size    (model)},
+            {"vocab_type",  llama_vocab_type       (vocab)},
+            {"n_vocab",     llama_vocab_n_tokens   (vocab)},
+            {"n_ctx_train", llama_model_n_ctx_train(model)},
+            {"n_embd",      llama_model_n_embd     (model)},
+            {"n_params",    llama_model_n_params   (model)},
+            {"size",        llama_model_size       (model)},
         };
     }
 };
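As an aside, the `model_meta()` hunk above is a compact summary of how the accessors are now split: vocab-level properties come from the `llama_vocab`, everything else stays on the `llama_model`. A standalone sketch of the same queries, printing instead of building JSON (illustrative only; only the accessor names are taken from the diff):

```cpp
#include "llama.h"

#include <cstdio>

// Illustrative sketch mirroring the accessors used by model_meta() above.
static void print_model_meta(const llama_model * model) {
    const llama_vocab * vocab = llama_model_get_vocab(model);

    printf("vocab_type  = %d\n",   (int) llama_vocab_type(vocab));  // was llama_vocab_type(model)
    printf("n_vocab     = %d\n",   llama_vocab_n_tokens(vocab));    // was llama_n_vocab(model)
    printf("n_ctx_train = %d\n",   llama_model_n_ctx_train(model)); // was llama_n_ctx_train(model)
    printf("n_embd      = %d\n",   llama_model_n_embd(model));      // was llama_n_embd(model)
    printf("n_params    = %llu\n", (unsigned long long) llama_model_n_params(model));
    printf("size        = %llu\n", (unsigned long long) llama_model_size(model));
}
```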
@@ -3639,7 +3646,7 @@ int main(int argc, char ** argv) {
         std::vector<server_task> tasks;

         try {
-            std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, data.at("prompt"), true, true);
+            std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, data.at("prompt"), true, true);
             tasks.reserve(tokenized_prompts.size());
             for (size_t i = 0; i < tokenized_prompts.size(); i++) {
                 server_task task = server_task(type);
@@ -3649,7 +3656,6 @@ int main(int argc, char ** argv) {

                 task.prompt_tokens    = std::move(tokenized_prompts[i]);
                 task.params           = server_task::params_from_json_cmpl(
-                                            ctx_server.model,
                                             ctx_server.ctx,
                                             ctx_server.params_base,
                                             data);
@@ -3745,13 +3751,13 @@ int main(int argc, char ** argv) {
     const auto handle_infill = [&ctx_server, &res_error, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
         // check model compatibility
         std::string err;
-        if (llama_token_fim_pre(ctx_server.model) == LLAMA_TOKEN_NULL) {
+        if (llama_vocab_fim_pre(ctx_server.vocab) == LLAMA_TOKEN_NULL) {
             err += "prefix token is missing. ";
         }
-        if (llama_token_fim_suf(ctx_server.model) == LLAMA_TOKEN_NULL) {
+        if (llama_vocab_fim_suf(ctx_server.vocab) == LLAMA_TOKEN_NULL) {
             err += "suffix token is missing. ";
         }
-        if (llama_token_fim_mid(ctx_server.model) == LLAMA_TOKEN_NULL) {
+        if (llama_vocab_fim_mid(ctx_server.vocab) == LLAMA_TOKEN_NULL) {
             err += "middle token is missing. ";
         }
         if (!err.empty()) {
@@ -3797,10 +3803,10 @@ int main(int argc, char ** argv) {
         data["input_extra"] = input_extra; // default to empty array if it's not exist

         std::string prompt = json_value(data, "prompt", std::string());
-        std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, prompt, false, true);
+        std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, false, true);
         SRV_DBG("creating infill tasks, n_prompts = %d\n", (int) tokenized_prompts.size());
         data["prompt"] = format_infill(
-            ctx_server.ctx,
+            ctx_server.vocab,
             data.at("input_prefix"),
             data.at("input_suffix"),
             data.at("input_extra"),
@@ -3857,7 +3863,7 @@ int main(int argc, char ** argv) {
             const bool add_special = json_value(body, "add_special", false);
             const bool with_pieces = json_value(body, "with_pieces", false);

-            llama_tokens tokens = tokenize_mixed(ctx_server.ctx, body.at("content"), add_special, true);
+            llama_tokens tokens = tokenize_mixed(ctx_server.vocab, body.at("content"), add_special, true);

             if (with_pieces) {
                 for (const auto& token : tokens) {
@@ -3933,7 +3939,7 @@ int main(int argc, char ** argv) {
             }
         }

-        std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, prompt, true, true);
+        std::vector<llama_tokens> tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true);
         for (const auto & tokens : tokenized_prompts) {
             // this check is necessary for models that do not add BOS token to the input
             if (tokens.empty()) {
@@ -4033,20 +4039,20 @@ int main(int argc, char ** argv) {
             return;
         }

-        llama_tokens tokenized_query = tokenize_input_prompts(ctx_server.ctx, query, /* add_special */ false, true)[0];
+        llama_tokens tokenized_query = tokenize_input_prompts(ctx_server.vocab, query, /* add_special */ false, true)[0];

         // create and queue the task
         json responses = json::array();
         bool error = false;
         {
             std::vector<server_task> tasks;
-            std::vector<llama_tokens> tokenized_docs = tokenize_input_prompts(ctx_server.ctx, documents, /* add_special */ false, true);
+            std::vector<llama_tokens> tokenized_docs = tokenize_input_prompts(ctx_server.vocab, documents, /* add_special */ false, true);
             tasks.reserve(tokenized_docs.size());
             for (size_t i = 0; i < tokenized_docs.size(); i++) {
                 server_task task   = server_task(SERVER_TASK_TYPE_RERANK);
                 task.id            = ctx_server.queue_tasks.get_new_id();
                 task.index         = i;
-                task.prompt_tokens = format_rerank(ctx_server.model, tokenized_query, tokenized_docs[i]);
+                task.prompt_tokens = format_rerank(ctx_server.vocab, tokenized_query, tokenized_docs[i]);
                 tasks.push_back(task);
             }

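Before moving on to the tokenization utilities, one server change above is worth spelling out: `validate_builtin_chat_template()` now fetches the template string from the model with `llama_model_chat_template()` and passes it to `llama_chat_apply_template()`, which no longer takes a model pointer. A hedged sketch of the two-step pattern with a real output buffer follows (the upstream validation only probes with a null buffer; the resizing logic here is the usual convention, not code from this commit):

```cpp
#include "llama.h"

#include <string>
#include <vector>

// Sketch: render a one-message conversation with the model's built-in template
// using the new split API. Returns an empty string if the template cannot be applied.
static std::string apply_builtin_template(const llama_model * model) {
    llama_chat_message chat[] = {{"user", "test"}};

    // was a single call: llama_chat_apply_template(model, nullptr, chat, 1, true, buf, len)
    const char * tmpl = llama_model_chat_template(model);

    std::vector<char> buf(1024);
    int32_t res = llama_chat_apply_template(tmpl, chat, 1, /*add_ass=*/true, buf.data(), (int32_t) buf.size());
    if (res < 0) {
        return "";
    }
    if (res > (int32_t) buf.size()) {
        // the call reports the required size; grow the buffer and try again
        buf.resize(res);
        res = llama_chat_apply_template(tmpl, chat, 1, /*add_ass=*/true, buf.data(), (int32_t) buf.size());
    }
    return std::string(buf.data(), (size_t) res);
}
```

The remaining hunks below move the server's prompt utilities (`tokenize_mixed`, `tokenize_input_prompts`, `format_rerank`, `format_infill`) from `llama_context *`/`llama_model *` parameters to the `llama_vocab *` they actually need.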
@@ -118,7 +118,7 @@ static json json_get_nested_values(const std::vector<std::string> & paths, const
  * - only string, example: "string"
  * - mixed string and tokens, example: [12, 34, "string", 56, 78]
  */
-static llama_tokens tokenize_mixed(const llama_context * ctx, const json & json_prompt, bool add_special, bool parse_special) {
+static llama_tokens tokenize_mixed(const llama_vocab * vocab, const json & json_prompt, bool add_special, bool parse_special) {
     // If `add_bos` is true, we only add BOS, when json_prompt is a string,
     // or the first element of the json_prompt array is a string.
     llama_tokens prompt_tokens;
@@ -131,10 +131,10 @@ static llama_tokens tokenize_mixed(const llama_context * ctx, const json & json_

                 llama_tokens p;
                 if (first) {
-                    p = common_tokenize(ctx, s, add_special, parse_special);
+                    p = common_tokenize(vocab, s, add_special, parse_special);
                     first = false;
                 } else {
-                    p = common_tokenize(ctx, s, false, parse_special);
+                    p = common_tokenize(vocab, s, false, parse_special);
                 }

                 prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
@@ -148,7 +148,7 @@ static llama_tokens tokenize_mixed(const llama_context * ctx, const json & json_
         }
     } else {
         auto s = json_prompt.template get<std::string>();
-        prompt_tokens = common_tokenize(ctx, s, add_special, parse_special);
+        prompt_tokens = common_tokenize(vocab, s, add_special, parse_special);
     }

     return prompt_tokens;
@@ -166,11 +166,11 @@ static llama_tokens tokenize_mixed(const llama_context * ctx, const json & json_
  * - "prompt": [[12, 34, 56], [78, 90, 12]]
  * - "prompt": [[12, 34, "string", 56, 78], [12, 34, 56]]
  */
-static std::vector<llama_tokens> tokenize_input_prompts(llama_context * ctx, const json & json_prompt, bool add_special, bool parse_special) {
+static std::vector<llama_tokens> tokenize_input_prompts(const llama_vocab * vocab, const json & json_prompt, bool add_special, bool parse_special) {
     std::vector<llama_tokens> result;
     if (json_prompt.is_string() || json_is_array_of_mixed_numbers_strings(json_prompt)) {
         // string or mixed
-        result.push_back(tokenize_mixed(ctx, json_prompt, add_special, parse_special));
+        result.push_back(tokenize_mixed(vocab, json_prompt, add_special, parse_special));
     } else if (json_is_array_of_numbers(json_prompt)) {
         // array of tokens
         result.push_back(json_prompt.get<llama_tokens>());
@@ -179,7 +179,7 @@ static std::vector<llama_tokens> tokenize_input_prompts(llama_context * ctx, con
         result.reserve(json_prompt.size());
         for (const auto & p : json_prompt) {
             if (p.is_string() || json_is_array_of_mixed_numbers_strings(p)) {
-                result.push_back(tokenize_mixed(ctx, p, add_special, parse_special));
+                result.push_back(tokenize_mixed(vocab, p, add_special, parse_special));
             } else if (json_is_array_of_numbers(p)) {
                 // array of tokens
                 result.push_back(p.get<llama_tokens>());
@@ -231,21 +231,23 @@ static size_t validate_utf8(const std::string& text) {
 //

 // format rerank task: [BOS]query[EOS][SEP]doc[EOS]
-static llama_tokens format_rerank(const struct llama_model * model, const llama_tokens & query, const llama_tokens & doc) {
+static llama_tokens format_rerank(const struct llama_vocab * vocab, const llama_tokens & query, const llama_tokens & doc) {
     llama_tokens result;
+
+    result.reserve(doc.size() + query.size() + 4);
-    result.push_back(llama_token_bos(model));
+    result.push_back(llama_vocab_bos(vocab));
     result.insert(result.end(), query.begin(), query.end());
-    result.push_back(llama_token_eos(model));
-    result.push_back(llama_token_sep(model));
+    result.push_back(llama_vocab_eos(vocab));
+    result.push_back(llama_vocab_sep(vocab));
     result.insert(result.end(), doc.begin(), doc.end());
-    result.push_back(llama_token_eos(model));
+    result.push_back(llama_vocab_eos(vocab));

     return result;
 }

 // format infill task
 static llama_tokens format_infill(
-        const llama_context * ctx,
+        const llama_vocab * vocab,
         const json & input_prefix,
         const json & input_suffix,
         const json & input_extra,
@@ -272,15 +274,14 @@ static llama_tokens format_infill(
     llama_tokens extra_tokens;
     extra_tokens.reserve(n_ctx);

-    auto model = llama_get_model(ctx);
-    auto tokens_prefix = tokenize_mixed(ctx, input_prefix, false, false);
-    auto tokens_suffix = tokenize_mixed(ctx, input_suffix, false, false);
+    auto tokens_prefix = tokenize_mixed(vocab, input_prefix, false, false);
+    auto tokens_suffix = tokenize_mixed(vocab, input_suffix, false, false);

-    if (llama_token_fim_rep(model) != LLAMA_TOKEN_NULL) {
+    if (llama_vocab_fim_rep(vocab) != LLAMA_TOKEN_NULL) {
         // TODO: make project name an input
-        static const auto k_fim_repo = common_tokenize(ctx, "myproject\n", false, false);
+        static const auto k_fim_repo = common_tokenize(vocab, "myproject\n", false, false);

-        extra_tokens.push_back(llama_token_fim_rep(model));
+        extra_tokens.push_back(llama_vocab_fim_rep(vocab));
         extra_tokens.insert(extra_tokens.end(), k_fim_repo.begin(), k_fim_repo.end());
     }
     for (const auto & chunk : input_extra) {
@@ -288,28 +289,28 @@ static llama_tokens format_infill(
         const std::string text     = json_value(chunk, "text",     std::string());
         const std::string filename = json_value(chunk, "filename", std::string("tmp"));

-        if (llama_token_fim_sep(model) != LLAMA_TOKEN_NULL) {
-            const auto k_fim_file = common_tokenize(ctx, filename + "\n", false, false);
+        if (llama_vocab_fim_sep(vocab) != LLAMA_TOKEN_NULL) {
+            const auto k_fim_file = common_tokenize(vocab, filename + "\n", false, false);

-            extra_tokens.insert(extra_tokens.end(), llama_token_fim_sep(model));
+            extra_tokens.insert(extra_tokens.end(), llama_vocab_fim_sep(vocab));
             extra_tokens.insert(extra_tokens.end(), k_fim_file.begin(), k_fim_file.end());
         } else {
             // chunk separator in binary form to avoid confusing the AI
             static const char k_chunk_prefix_str[] = {0x0a, 0x0a, 0x2d, 0x2d, 0x2d, 0x20, 0x73, 0x6e, 0x69, 0x70, 0x70, 0x65, 0x74, 0x20, 0x2d, 0x2d, 0x2d, 0x0a, 0x0a, 0x00};
-            static const auto k_chunk_prefix_tokens = common_tokenize(ctx, k_chunk_prefix_str, false, false);
+            static const auto k_chunk_prefix_tokens = common_tokenize(vocab, k_chunk_prefix_str, false, false);

             extra_tokens.insert(extra_tokens.end(), k_chunk_prefix_tokens.begin(), k_chunk_prefix_tokens.end());
         }

-        const auto chunk_tokens = common_tokenize(ctx, text, false, false);
+        const auto chunk_tokens = common_tokenize(vocab, text, false, false);
         extra_tokens.insert(extra_tokens.end(), chunk_tokens.begin(), chunk_tokens.end());
     }

-    if (llama_token_fim_sep(model) != LLAMA_TOKEN_NULL) {
+    if (llama_vocab_fim_sep(vocab) != LLAMA_TOKEN_NULL) {
         // TODO: current filename
-        static const auto k_fim_file = common_tokenize(ctx, "filename\n", false, false);
+        static const auto k_fim_file = common_tokenize(vocab, "filename\n", false, false);

-        extra_tokens.insert(extra_tokens.end(), llama_token_fim_sep(model));
+        extra_tokens.insert(extra_tokens.end(), llama_vocab_fim_sep(vocab));
         extra_tokens.insert(extra_tokens.end(), k_fim_file.begin(), k_fim_file.end());
     }

@@ -325,15 +326,15 @@ static llama_tokens format_infill(
     tokens_prefix.erase(tokens_prefix.begin(), tokens_prefix.begin() + tokens_prefix.size() - n_prefix_take);
     tokens_suffix.resize(n_suffix_take);

-    tokens_prefix.insert(tokens_prefix.begin(), llama_token_fim_pre(model));
+    tokens_prefix.insert(tokens_prefix.begin(), llama_vocab_fim_pre(vocab));
     tokens_prefix.insert(tokens_prefix.end(),   tokens_prompt.begin(), tokens_prompt.end());
-    tokens_suffix.insert(tokens_suffix.begin(), llama_token_fim_suf(model));
+    tokens_suffix.insert(tokens_suffix.begin(), llama_vocab_fim_suf(vocab));

     auto embd_inp = spm_infill ? tokens_suffix : tokens_prefix;
     auto embd_end = spm_infill ? tokens_prefix : tokens_suffix;

-    if (llama_add_bos_token(model)) {
-        embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
+    if (llama_vocab_get_add_bos(vocab)) {
+        embd_inp.insert(embd_inp.begin(), llama_vocab_bos(vocab));
     }

     SRV_DBG("extra: n_ctx = %d, n_extra_take = %d, n_extra = %d\n", n_ctx, n_extra_take, (int) extra_tokens.size());
@@ -342,7 +343,7 @@ static llama_tokens format_infill(
     embd_inp.insert(embd_inp.begin(), extra_tokens.end() - n_extra_take, extra_tokens.end());

     embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
-    embd_inp.push_back(llama_token_fim_mid(model));
+    embd_inp.push_back(llama_vocab_fim_mid(vocab));

     return embd_inp;
 }
@@ -764,14 +765,18 @@ static json format_logit_bias(const std::vector<llama_logit_bias> & logit_bias)
     return data;
 }

-static std::string safe_json_to_str(json data) {
+static std::string safe_json_to_str(const json & data) {
     return data.dump(-1, ' ', false, json::error_handler_t::replace);
 }

 static std::vector<llama_token_data> get_token_probabilities(llama_context * ctx, int idx) {
     std::vector<llama_token_data> cur;
     const auto * logits = llama_get_logits_ith(ctx, idx);
-    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
+    const int n_vocab = llama_vocab_n_tokens(vocab);

     cur.resize(n_vocab);
     for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
@@ -799,8 +804,8 @@ static std::vector<llama_token_data> get_token_probabilities(llama_context * ctx
 }

 static bool are_lora_equal(
-        const std::vector<common_lora_adapter_info> & l1,
-        const std::vector<common_lora_adapter_info> & l2) {
+        const std::vector<common_adapter_lora_info> & l1,
+        const std::vector<common_adapter_lora_info> & l2) {
     if (l1.size() != l2.size()) {
         return false;
     }
@@ -814,10 +819,10 @@ static bool are_lora_equal(
 }

 // parse lora config from JSON request, returned a copy of lora_base with updated scale
-static std::vector<common_lora_adapter_info> parse_lora_request(
-        const std::vector<common_lora_adapter_info> & lora_base,
+static std::vector<common_adapter_lora_info> parse_lora_request(
+        const std::vector<common_adapter_lora_info> & lora_base,
         const json & data) {
-    std::vector<common_lora_adapter_info> lora(lora_base);
+    std::vector<common_adapter_lora_info> lora(lora_base);
     int max_idx = lora.size();

     // clear existing value

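The `get_token_probabilities()` hunk shows the same model-to-vocab indirection inside a helper whose body is mostly outside this diff, so here is a hedged reconstruction of the whole pattern, softmax included, to make the renamed accessor concrete (a sketch assuming `llama_decode()` has already produced logits for position `idx`; it is not the upstream implementation):

```cpp
#include "llama.h"

#include <algorithm>
#include <cmath>
#include <vector>

// Reconstruction sketch: convert the logits at position idx into per-token
// probabilities, getting n_vocab through the renamed vocab accessor.
static std::vector<llama_token_data> token_probabilities(llama_context * ctx, int32_t idx) {
    const float * logits = llama_get_logits_ith(ctx, idx);

    const llama_model * model = llama_get_model(ctx);
    const llama_vocab * vocab = llama_model_get_vocab(model);

    const int32_t n_vocab = llama_vocab_n_tokens(vocab); // was llama_n_vocab(llama_get_model(ctx))

    std::vector<llama_token_data> cur(n_vocab);
    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
        cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
    }

    // softmax over the raw logits
    const float max_logit = std::max_element(cur.begin(), cur.end(),
            [](const llama_token_data & a, const llama_token_data & b) { return a.logit < b.logit; })->logit;

    float sum = 0.0f;
    for (auto & td : cur) {
        td.p = std::exp(td.logit - max_logit);
        sum += td.p;
    }
    for (auto & td : cur) {
        td.p /= sum;
    }

    return cur;
}
```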
Authored and committed by Georgi Gerganov.