	Merge branch 'master' into xsn/private_batch_api
@@ -131,9 +131,9 @@ struct slot_params {
             lora.push_back({{"id", i}, {"scale", this->lora[i].scale}});
         }
 
-        std::vector<std::string> grammar_trigger_words;
-        for (const auto & trigger : sampling.grammar_trigger_words) {
-            grammar_trigger_words.push_back(trigger.word);
+        auto grammar_triggers = json::array();
+        for (const auto & trigger : sampling.grammar_triggers) {
+            grammar_triggers.push_back(trigger.to_json<json>());
         }
 
         return json {
@@ -170,8 +170,8 @@ struct slot_params {
             {"n_probs",                   sampling.n_probs},
             {"min_keep",                  sampling.min_keep},
             {"grammar",                   sampling.grammar},
-            {"grammar_trigger_words",     grammar_trigger_words},
-            {"grammar_trigger_tokens",    sampling.grammar_trigger_tokens},
+            {"grammar_lazy",              sampling.grammar_lazy},
+            {"grammar_triggers",          grammar_triggers},
             {"preserved_tokens",          sampling.preserved_tokens},
             {"chat_format",               common_chat_format_name(oaicompat_chat_format)},
             {"samplers",                  samplers},
@@ -356,24 +356,6 @@ struct server_task {
         }
 
         {
-            const auto grammar_triggers = data.find("grammar_triggers");
-            if (grammar_triggers != data.end()) {
-                for (const auto & t : *grammar_triggers) {
-                    common_grammar_trigger trigger;
-                    trigger.word = t.at("word");
-                    trigger.at_start = t.at("at_start");
-
-                    auto ids = common_tokenize(vocab, trigger.word, /* add_special= */ false, /* parse_special= */ true);
-                    if (ids.size() == 1) {
-                        SRV_DBG("Grammar trigger token: %d (`%s`)\n", ids[0], trigger.word.c_str());
-                        params.sampling.grammar_trigger_tokens.push_back(ids[0]);
-                        params.sampling.preserved_tokens.insert(ids[0]);
-                        continue;
-                    }
-                    SRV_DBG("Grammar trigger word: `%s`\n", trigger.word.c_str());
-                    params.sampling.grammar_trigger_words.push_back(trigger);
-                }
-            }
             const auto preserved_tokens = data.find("preserved_tokens");
             if (preserved_tokens != data.end()) {
                 for (const auto & t : *preserved_tokens) {
@@ -383,12 +365,39 @@ struct server_task {
                         params.sampling.preserved_tokens.insert(ids[0]);
                     } else {
                         // This may happen when using a tool call style meant for a model with special tokens to preserve on a model without said tokens.
-                        SRV_WRN("Not preserved because more than 1 token (wrong chat template override?): %s\n", t.get<std::string>().c_str());
+                        SRV_DBG("Not preserved because more than 1 token: %s\n", t.get<std::string>().c_str());
                     }
                 }
             }
-            if (params.sampling.grammar_lazy) {
-                GGML_ASSERT(params.sampling.grammar_trigger_tokens.size() > 0 || params.sampling.grammar_trigger_words.size() > 0);
+            const auto grammar_triggers = data.find("grammar_triggers");
+            if (grammar_triggers != data.end()) {
+                for (const auto & t : *grammar_triggers) {
+                    auto ct = common_grammar_trigger::from_json(t);
+                    if (ct.type == COMMON_GRAMMAR_TRIGGER_TYPE_WORD) {
+                        const auto & word = ct.value;
+                        auto ids = common_tokenize(vocab, word, /* add_special= */ false, /* parse_special= */ true);
+                        if (ids.size() == 1) {
+                            auto token = ids[0];
+                            if (std::find(params.sampling.preserved_tokens.begin(), params.sampling.preserved_tokens.end(), (llama_token) token) == params.sampling.preserved_tokens.end()) {
+                                throw std::runtime_error("Grammar trigger word should be marked as preserved token: " + word);
+                            }
+                            SRV_DBG("Grammar trigger token: %d (`%s`)\n", token, word.c_str());
+                            common_grammar_trigger trigger;
+                            trigger.type = COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN;
+                            trigger.value = word;
+                            trigger.token = token;
+                            params.sampling.grammar_triggers.push_back(std::move(trigger));
+                        } else {
+                            SRV_DBG("Grammar trigger word: `%s`\n", word.c_str());
+                            params.sampling.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, word});
+                        }
+                    } else {
+                        params.sampling.grammar_triggers.push_back(ct);
+                    }
+                }
+            }
+            if (params.sampling.grammar_lazy && params.sampling.grammar_triggers.empty()) {
+                throw std::runtime_error("Error: no triggers set for lazy grammar!");
             }
         }
 
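The hunks above replace the old `grammar_trigger_words` / `grammar_trigger_tokens` pair with a single `params.sampling.grammar_triggers` list of `common_grammar_trigger` objects. The sketch below is only an illustration of the interface those hunks rely on (the `type`, `value` and `token` members, the two trigger types named in the diff, and the JSON round-trip helpers); it is not the definition from `common.h`, and the JSON field names are an assumption.

```cpp
#include <cstdint>
#include <string>
#include <nlohmann/json.hpp>

// Hypothetical sketch of the trigger type used by the server hunks above.
// The real struct lives in common.h and may have more trigger types and a
// different JSON layout.
typedef int32_t llama_token; // stand-in for the typedef from llama.h

enum common_grammar_trigger_type {
    COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN, // fire when a specific token id is sampled
    COMMON_GRAMMAR_TRIGGER_TYPE_WORD,  // fire when a literal word appears in the output
};

struct common_grammar_trigger {
    common_grammar_trigger_type type;
    std::string value;       // the trigger word (kept for logging even for token triggers)
    llama_token token = -1;  // only meaningful for TYPE_TOKEN

    template <class T> T to_json() const {
        // field names are assumed, not taken from the diff
        return T{{"type", (int) type}, {"value", value}, {"token", token}};
    }

    template <class T> static common_grammar_trigger from_json(const T & in) {
        common_grammar_trigger out;
        out.type  = (common_grammar_trigger_type) in.at("type").template get<int>();
        out.value = in.at("value").template get<std::string>();
        out.token = in.contains("token") ? in.at("token").template get<llama_token>() : -1;
        return out;
    }
};
```

With that shape, the request-parsing hunk above turns single-token WORD triggers into TOKEN triggers and insists that such words are also listed in `preserved_tokens`, so the matched text survives detokenization.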
@@ -742,7 +751,10 @@ struct server_task_result_cmpl_final : server_task_result {
                         {"name", tc.name},
                         {"arguments", tc.arguments},
                     }},
-                    {"id", tc.id},
+                    // Some templates generate and require an id (sometimes in a very specific format, e.g. Mistral Nemo).
+                    // We only generate a random id for the ones that don't generate one by themselves
+                    // (they also won't get to see it as their template likely doesn't use it, so it's all for the client)
+                    {"id", tc.id.empty() ? gen_tool_call_id() : tc.id},
                 });
             }
             message["tool_calls"] = tool_calls;
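`gen_tool_call_id()` referenced above is defined elsewhere in the server sources and its body is not part of this diff. A minimal sketch of a generator in that spirit, assuming a 32-character random alphanumeric id (length and alphabet are assumptions, not something this diff specifies):

```cpp
#include <random>
#include <string>

// Hypothetical stand-in for gen_tool_call_id(); the real helper may use a
// different length, alphabet or id format.
static std::string gen_tool_call_id_sketch() {
    static const char alphabet[] =
        "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
    std::random_device rd;
    std::mt19937 gen(rd());
    // sizeof(alphabet) - 2: last valid index, skipping the trailing '\0'
    std::uniform_int_distribution<size_t> dist(0, sizeof(alphabet) - 2);
    std::string id(32, '0');
    for (auto & c : id) {
        c = alphabet[dist(gen)];
    }
    return id;
}
```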
@@ -1304,7 +1316,7 @@ struct server_slot {
         return task_type == SERVER_TASK_TYPE_EMBEDDING || task_type == SERVER_TASK_TYPE_RERANK;
     }
 
-    bool can_batch_with(server_slot & other_slot) {
+    bool can_batch_with(server_slot & other_slot) const {
         return is_non_causal() == other_slot.is_non_causal()
             && are_lora_equal(lora, other_slot.lora);
     }
@@ -1888,6 +1900,7 @@ struct server_context {
         try {
             common_chat_format_example(chat_templates.get(), params.use_jinja);
         } catch (const std::exception & e) {
+            SRV_WRN("%s: Chat template parsing error: %s\n", __func__, e.what());
             SRV_WRN("%s: The chat template that comes with this model is not yet supported, falling back to chatml. This may cause the model to output suboptimal responses\n", __func__);
             chat_templates = common_chat_templates_init(model, "chatml");
         }
@@ -2023,6 +2036,18 @@ struct server_context {
         return ret;
     }
 
+    bool can_be_detokenized(const struct llama_context * ctx, const std::vector<llama_token> & tokens) {
+        const llama_model * model = llama_get_model(ctx);
+        const llama_vocab * vocab = llama_model_get_vocab(model);
+        const int32_t n_vocab = llama_vocab_n_tokens(vocab);
+        for (const auto & token : tokens) {
+            if (token < 0 || token >= n_vocab) {
+                return false;
+            }
+        }
+        return true;
+    }
+
     bool launch_slot_with_task(server_slot & slot, const server_task & task) {
         slot.reset();
         slot.id_task       = task.id;
@@ -2037,11 +2062,16 @@ struct server_context {
             slot.lora = task.params.lora;
         }
 
+        bool can_detokenize = can_be_detokenized(ctx, slot.prompt_tokens);
+        if (!can_detokenize) {
+            send_error(task, "Prompt contains invalid tokens", ERROR_TYPE_INVALID_REQUEST);
+            return false;
+        }
         SLT_DBG(slot, "launching slot : %s\n", safe_json_to_str(slot.to_json()).c_str());
 
         if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) {
             // Might be better to reject the request with a 400 ?
-            SLT_WRN(slot, "n_predict = %d exceeds server configuration, setting to %d", slot.params.n_predict, slot.n_predict);
+            SLT_WRN(slot, "n_predict = %d exceeds server configuration, setting to %d\n", slot.params.n_predict, slot.n_predict);
             slot.params.n_predict = slot.n_predict;
         }
 
@@ -2077,7 +2107,7 @@ struct server_context {
         SRV_DBG("%s", "clearing KV cache\n");
 
         // clear the entire KV cache
-        llama_kv_cache_clear(ctx);
+        llama_kv_self_clear(ctx);
         clean_kv_cache = false;
     }
 
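This hunk and several of the remaining ones rename the KV-cache calls from `llama_kv_cache_*` / `llama_get_kv_cache_*` to `llama_kv_self_*`; the arguments stay the same, except in the cache-reuse hunk where the `seq_add` range end changes from `-1` to `head_c + n_match`. As a reading aid, a minimal sketch of the context-shift pattern using the new names (the wrapper function here is made up; only the two `llama_kv_self_*` calls are taken from the diff):

```cpp
#include "llama.h"

// Assumed helper, not part of the diff: drop n_discard cells after the first
// n_keep for one sequence, then shift the remaining cells back into place,
// mirroring the context-shift hunk further down.
static void kv_context_shift_sketch(llama_context * ctx, llama_seq_id seq_id,
                                    llama_pos n_keep, llama_pos n_past, llama_pos n_discard) {
    // remove the cells in [n_keep, n_keep + n_discard)
    llama_kv_self_seq_rm (ctx, seq_id, n_keep, n_keep + n_discard);
    // shift [n_keep + n_discard, n_past) left by n_discard positions
    llama_kv_self_seq_add(ctx, seq_id, n_keep + n_discard, n_past, -n_discard);
}
```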
@@ -2142,14 +2172,6 @@ struct server_context {
         }
 
         if (slot.has_new_line) {
-            // if we have already seen a new line, we stop after a certain time limit
-            if (slot.params.t_max_predict_ms > 0 && (ggml_time_us() - slot.t_start_generation > 1000.0f*slot.params.t_max_predict_ms)) {
-                slot.stop           = STOP_TYPE_LIMIT;
-                slot.has_next_token = false;
-
-                SLT_DBG(slot, "stopped by time limit, n_decoded = %d, t_max_predict_ms = %d ms\n", slot.n_decoded, (int) slot.params.t_max_predict_ms);
-            }
-
             // require that each new line has a whitespace prefix (i.e. indentation) of at least slot.params.n_indent
             if (slot.params.n_indent > 0) {
                 // check the current indentation
@@ -2188,6 +2210,14 @@ struct server_context {
         // check if there is a new line in the generated text
         if (result.text_to_send.find('\n') != std::string::npos) {
             slot.has_new_line = true;
+
+            // if we have seen a new line, we stop after a certain time limit, but only upon another new line
+            if (slot.params.t_max_predict_ms > 0 && (ggml_time_us() - slot.t_start_generation > 1000.0f*slot.params.t_max_predict_ms)) {
+                slot.stop           = STOP_TYPE_LIMIT;
+                slot.has_next_token = false;
+
+                SLT_DBG(slot, "stopped by time limit, n_decoded = %d, t_max_predict_ms = %d ms\n", slot.n_decoded, (int) slot.params.t_max_predict_ms);
+            }
         }
 
         // if context shift is disabled, we stop when it reaches the context limit
@@ -2621,8 +2651,8 @@ struct server_context {
                     res->n_tasks_deferred    = queue_tasks.queue_tasks_deferred.size();
                     res->t_start             = metrics.t_start;
 
-                    res->kv_cache_tokens_count = llama_get_kv_cache_token_count(ctx);
-                    res->kv_cache_used_cells   = llama_get_kv_cache_used_cells(ctx);
+                    res->kv_cache_tokens_count = llama_kv_self_n_tokens(ctx);
+                    res->kv_cache_used_cells   = llama_kv_self_used_cells(ctx);
 
                     res->n_prompt_tokens_processed_total = metrics.n_prompt_tokens_processed_total;
                     res->t_prompt_processing_total       = metrics.t_prompt_processing_total;
@@ -2738,7 +2768,7 @@ struct server_context {
 
                     // Erase token cache
                     const size_t n_erased = slot->cache_tokens.size();
-                    llama_kv_cache_seq_rm(ctx, slot->id, -1, -1);
+                    llama_kv_self_seq_rm(ctx, slot->id, -1, -1);
                     slot->cache_tokens.clear();
 
                     auto res = std::make_unique<server_task_result_slot_erase>();
@@ -2806,8 +2836,8 @@ struct server_context {
 
                 SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard);
 
-                llama_kv_cache_seq_rm (ctx, slot.id, n_keep            , n_keep + n_discard);
-                llama_kv_cache_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past,        -n_discard);
+                llama_kv_self_seq_rm (ctx, slot.id, n_keep            , n_keep + n_discard);
+                llama_kv_self_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past,        -n_discard);
 
                 if (slot.params.cache_prompt) {
                     for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) {
@@ -2998,8 +3028,8 @@ struct server_context {
 
                                             const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c;
 
-                                            llama_kv_cache_seq_rm (ctx, slot.id, head_p, head_c);
-                                            llama_kv_cache_seq_add(ctx, slot.id, head_c, -1,     kv_shift);
+                                            llama_kv_self_seq_rm (ctx, slot.id, head_p, head_c);
+                                            llama_kv_self_seq_add(ctx, slot.id, head_c, head_c + n_match, kv_shift);
 
                                             for (size_t i = 0; i < n_match; i++) {
                                                 slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i];
@@ -3037,9 +3067,9 @@ struct server_context {
                     }
 
                     // keep only the common part
-                    if (!llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1)) {
+                    if (!llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1)) {
                         // could not partially delete (likely using a non-Transformer model)
-                        llama_kv_cache_seq_rm(ctx, slot.id, -1, -1);
+                        llama_kv_self_seq_rm(ctx, slot.id, -1, -1);
 
                         // there is no common part left
                         slot.n_past = 0;
@@ -3271,7 +3301,7 @@ struct server_context {
                 slot.cache_tokens.push_back(id);
                 slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1);
 
-                llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1);
+                llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1);
 
                 for (size_t i = 0; i < ids.size(); ++i) {
                     completion_token_output result;