@@ -378,7 +378,7 @@ struct server_queue {
     std::condition_variable condition_tasks;
 
     // callback functions
-    std::function<void(server_task&)> callback_new_task;
+    std::function<void(server_task)> callback_new_task;
     std::function<void(void)>        callback_update_slots;
 
     // Add a new task to the end of the queue
@@ -431,7 +431,7 @@ struct server_queue {
     }
 
     // Register function to process a new task
-    void on_new_task(std::function<void(server_task &)> callback) {
+    void on_new_task(std::function<void(server_task)> callback) {
         callback_new_task = std::move(callback);
     }
 
@@ -481,7 +481,7 @@ struct server_queue {
                 lock.unlock();
 
                 QUE_DBG("processing task, id = %d\n", task.id);
-                callback_new_task(task);
+                callback_new_task(std::move(task));
             }
 
             // all tasks in the current loop is processed, slots data is now ready
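Note on the three hunks above: callback_new_task now takes its server_task by value instead of by reference, and the queue loop moves the task into the callback, so the handler owns the task outright rather than borrowing storage that still belongs to the queue. A minimal sketch of the pattern, with illustrative names (task_queue, on_task) that are not part of the server code:

    // Minimal sketch (illustrative names, not the server's API): pop the task
    // under the lock, release the lock, then move the task into a by-value
    // callback so the consumer takes ownership without copying the payload again.
    #include <deque>
    #include <functional>
    #include <mutex>
    #include <string>
    #include <utility>

    struct task {
        int         id;
        std::string payload;
    };

    struct task_queue {
        std::deque<task>          tasks;
        std::mutex                mutex;
        std::function<void(task)> on_task;   // by value: the handler owns its argument

        void process_next() {
            std::unique_lock<std::mutex> lock(mutex);
            if (tasks.empty()) {
                return;
            }
            task t = std::move(tasks.front());
            tasks.pop_front();
            lock.unlock();                   // do not run the callback while holding the lock
            on_task(std::move(t));           // move instead of copy, matching the hunk above
        }
    };

Passing by value plus std::move costs one move construction but removes any question of who may still touch the task after the callback returns.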
@@ -644,17 +644,12 @@ struct server_context {
     bool load_model(const common_params & params_) {
         params = params_;
 
-        // reserve one extra sequence (seq_id == 0) for extra features
-        params.n_parallel += 1;
-
         common_init_result llama_init = common_init_from_params(params);
 
         model = llama_init.model;
         ctx   = llama_init.context;
         loras = llama_init.lora_adapters;
 
-        params.n_parallel -= 1; // but be sneaky about it
-
         if (model == nullptr) {
             SRV_ERR("failed to load model, '%s'\n", params.model.c_str());
             return false;
@@ -1297,7 +1292,7 @@ struct server_context {
         std::vector<float> embd_res(n_embd, 0.0f);
 
         for (int i = 0; i < batch.n_tokens; ++i) {
-            if (!batch.logits[i] || batch.seq_id[i][0] != slot.id + 1) {
+            if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
                 continue;
             }
 
@@ -1337,7 +1332,7 @@ struct server_context {
         res.stop  = true;
 
         for (int i = 0; i < batch.n_tokens; ++i) {
-            if (!batch.logits[i] || batch.seq_id[i][0] != slot.id + 1) {
+            if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
                 continue;
             }
 
@@ -1510,7 +1505,7 @@ struct server_context {
     // Functions to process the task
     //
 
-    void process_single_task(const server_task & task) {
+    void process_single_task(server_task task) {
         switch (task.type) {
             case SERVER_TASK_TYPE_INFERENCE:
                 {
@@ -1646,7 +1641,7 @@ struct server_context {
                     std::string filename = task.data.at("filename");
                     std::string filepath = task.data.at("filepath");
 
-                    const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id + 1, slot->cache_tokens.data(), token_count);
+                    const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id, slot->cache_tokens.data(), token_count);
 
                     const int64_t t_end = ggml_time_us();
                     const double t_save_ms = (t_end - t_start) / 1000.0;
@@ -1688,7 +1683,7 @@ struct server_context {
 
                     slot->cache_tokens.resize(slot->n_ctx);
                     size_t token_count = 0;
-                    size_t nread = llama_state_seq_load_file(ctx, filepath.c_str(), slot->id + 1, slot->cache_tokens.data(), slot->cache_tokens.size(), &token_count);
+                    size_t nread = llama_state_seq_load_file(ctx, filepath.c_str(), slot->id, slot->cache_tokens.data(), slot->cache_tokens.size(), &token_count);
                     if (nread == 0) {
                         slot->cache_tokens.resize(0);
                         send_error(task, "Unable to restore slot, no available space in KV cache or invalid slot save file", ERROR_TYPE_INVALID_REQUEST);
@@ -1731,7 +1726,7 @@ struct server_context {
 
                     // Erase token cache
                     const size_t n_erased = slot->cache_tokens.size();
-                    llama_kv_cache_seq_rm(ctx, slot->id + 1, -1, -1);
+                    llama_kv_cache_seq_rm(ctx, slot->id, -1, -1);
                     slot->cache_tokens.clear();
 
                     server_task_result result;
@@ -1808,8 +1803,8 @@ struct server_context {
 
                 SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard);
 
-                llama_kv_cache_seq_rm (ctx, slot.id + 1, n_keep            , n_keep + n_discard);
-                llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, slot.n_past,        -n_discard);
+                llama_kv_cache_seq_rm (ctx, slot.id, n_keep            , n_keep + n_discard);
+                llama_kv_cache_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past,        -n_discard);
 
                 if (slot.params.cache_prompt) {
                     for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) {
@@ -1836,7 +1831,7 @@ struct server_context {
 
             slot.i_batch = batch.n_tokens;
 
-            common_batch_add(batch, slot.sampled, slot.n_past, { slot.id + 1 }, true);
+            common_batch_add(batch, slot.sampled, slot.n_past, { slot.id }, true);
 
             slot.n_past += 1;
 
@@ -1983,8 +1978,8 @@ struct server_context {
 
                                             const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c;
 
-                                            llama_kv_cache_seq_rm (ctx, slot.id + 1, head_p, head_c);
-                                            llama_kv_cache_seq_add(ctx, slot.id + 1, head_c, -1,     kv_shift);
+                                            llama_kv_cache_seq_rm (ctx, slot.id, head_p, head_c);
+                                            llama_kv_cache_seq_add(ctx, slot.id, head_c, -1,     kv_shift);
 
                                             for (size_t i = 0; i < n_match; i++) {
                                                 slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i];
@@ -2033,9 +2028,9 @@ struct server_context {
                     }
 
                     // keep only the common part
-                    if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, slot.n_past, -1)) {
+                    if (!llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1)) {
                         // could not partially delete (likely using a non-Transformer model)
-                        llama_kv_cache_seq_rm(ctx, slot.id + 1, -1, -1);
+                        llama_kv_cache_seq_rm(ctx, slot.id, -1, -1);
 
                         // there is no common part left
                         slot.n_past = 0;
@@ -2048,7 +2043,7 @@ struct server_context {
 
                     // add prompt tokens for processing in the current batch
                     while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch) {
-                        common_batch_add(batch, prompt_tokens[slot.n_past], slot.n_past, { slot.id + 1 }, false);
+                        common_batch_add(batch, prompt_tokens[slot.n_past], slot.n_past, { slot.id }, false);
 
                         if (slot.params.cache_prompt) {
                             slot.cache_tokens.push_back(prompt_tokens[slot.n_past]);
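Note on the remaining hunks: they all follow from one decision, namely that a slot's KV-cache sequence id is now the slot id itself. Previously load_model reserved an extra sequence (the params.n_parallel += 1 / -= 1 pair, keeping seq_id 0 free for extra features), so every KV-cache, batch, and state-file call had to offset by slot.id + 1; with the reservation removed, the + 1 disappears everywhere, including the batch.seq_id[i][0] != slot.id checks when collecting logits and embeddings. A rough sketch of how a token ends up tagged with its slot's sequence id, assuming llama.h's llama_batch layout; the helper name is hypothetical and mirrors what common_batch_add already does:

    #include "llama.h"

    // Hypothetical helper: append one token that belongs to a single slot.
    // After this change the slot id is used directly as the llama_seq_id.
    static void add_token_for_slot(llama_batch & batch, llama_token tok, llama_pos pos,
                                   llama_seq_id slot_id, bool want_logits) {
        const int i = batch.n_tokens;

        batch.token   [i]    = tok;
        batch.pos     [i]    = pos;
        batch.n_seq_id[i]    = 1;
        batch.seq_id  [i][0] = slot_id;      // previously this had to be slot_id + 1
        batch.logits  [i]    = want_logits;

        batch.n_tokens++;
    }

On the decode side this is why the result-collection loops can compare batch.seq_id[i][0] against slot.id directly, and why llama_state_seq_save_file / llama_state_seq_load_file now receive slot->id unchanged.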