diff --git a/tools/server/server.cpp b/tools/server/server.cpp index af2f237e5d..60d92fd705 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -3946,8 +3946,11 @@ struct server_context { // truncate any tokens that are beyond n_past for this slot const llama_pos p0 = slot.prompt.tokens.pos_next(); + + SLT_INF(slot, "n_tokens = %d, memory_seq_rm [%d, end)\n", slot.prompt.n_tokens(), p0); + if (!llama_memory_seq_rm(llama_get_memory(ctx), slot.id, p0, -1)) { - SLT_WRN(slot, "failed to truncate tokens with position >= %d\n", p0); + SLT_WRN(slot, "failed to truncate tokens with position >= %d - clearing the memory\n", p0); llama_memory_seq_rm(llama_get_memory(ctx), slot.id, -1, -1); // there is no common part left @@ -3956,8 +3959,6 @@ struct server_context { slot.prompt.tokens.clear(); } - SLT_INF(slot, "n_tokens = %d, memory_seq_rm [%d, end)\n", slot.prompt.n_tokens(), p0); - // check if we should process the image if (slot.prompt.n_tokens() < slot.task->n_tokens() && input_tokens[slot.prompt.n_tokens()] == LLAMA_TOKEN_NULL) { // process the image diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp index fb95361b28..f68420eb30 100644 --- a/tools/server/utils.hpp +++ b/tools/server/utils.hpp @@ -1210,7 +1210,7 @@ public: for (auto it = tokens.map_idx_to_media.begin(); it != tokens.map_idx_to_media.end(); ) { auto * chunk = tokens.map_idx_to_media[it->first].get(); mtmd::input_chunk_ptr new_chunk(mtmd_input_chunk_copy(chunk)); - map_idx_to_media[start_idx+it->first] = std::move(new_chunk); + map_idx_to_media[start_idx + it->first] = std::move(new_chunk); } } } @@ -1242,6 +1242,7 @@ public: } void clear() { + map_idx_to_media.clear(); tokens.clear(); }