Mirror of https://github.com/ggml-org/llama.cpp.git, synced 2025-10-31 08:51:55 +00:00
	llama : add llama_vocab, functions -> methods, naming (#11110)
				
					
				
* llama : functions -> methods (#11110)
* llama : add struct llama_vocab to the API (#11156) ggml-ci
* hparams : move vocab params to llama_vocab (#11159) ggml-ci
* vocab : more pimpl (#11165) ggml-ci
* vocab : minor tokenization optimizations (#11160) ggml-ci
  Co-authored-by: Diego Devesa <slarengh@gmail.com>
* lora : update API names (#11167) ggml-ci
* llama : update API names to use correct prefix (#11174)
  * llama : update API names to use correct prefix ggml-ci
  * cont ggml-ci
  * cont ggml-ci
  * minor [no ci]
* vocab : llama_vocab_add_[be]os -> llama_vocab_get_add_[be]os (#11174) ggml-ci
* vocab : llama_vocab_n_vocab -> llama_vocab_n_tokens (#11174) ggml-ci

Co-authored-by: Diego Devesa <slarengh@gmail.com>
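
Before reading the diff, here is a minimal C++ sketch of how call sites change under the new vocab API. The wrapper function and the printf at the end are illustrative only; every new-API call in the body appears in the diff below.

    #include "llama.h"
    #include <cstdio>

    // Hedged sketch: old-style calls (shown in trailing comments) vs. the renamed API.
    // Assumes `ctx` is a valid llama_context created elsewhere.
    static void print_vocab_info(llama_context * ctx) {
        const llama_model * model = llama_get_model(ctx);         // unchanged
        const llama_vocab * vocab = llama_model_get_vocab(model); // new: vocab handle

        const bool add_bos = llama_vocab_get_add_bos(vocab);    // was llama_add_bos_token(model)
        const bool add_eos = llama_vocab_get_add_eos(vocab);    // was llama_add_eos_token(model)
        const int  n_vocab = llama_vocab_n_tokens(vocab);       // was llama_n_vocab(model)
        const llama_token bos = llama_vocab_bos(vocab);         // was llama_token_bos(model)
        const int n_ctx_train = llama_model_n_ctx_train(model); // was llama_n_ctx_train(model)

        printf("add_bos=%d add_eos=%d n_vocab=%d bos=%d n_ctx_train=%d\n",
               add_bos, add_eos, n_vocab, (int) bos, n_ctx_train);
    }
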
@@ -7,7 +7,6 @@
 #include <cstdio>
 #include <cstring>
 #include <ctime>
-#include <sstream>
 #include <thread>
 #include <mutex>
 #include <vector>
@@ -40,7 +39,7 @@ public:
     void set_params(common_params params) { m_params = std::move(params); }
     bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
     void save_imatrix(int ncall = -1) const;
-    bool load_imatrix(const char * file_name);
+    bool load_imatrix(const char * fname);
 private:
     std::unordered_map<std::string, Stats> m_stats;
     common_params                          m_params;
@@ -429,10 +428,13 @@ static void process_logits(
 }
 
 static bool compute_imatrix(llama_context * ctx, const common_params & params) {
-    const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
+    const bool add_bos = llama_vocab_get_add_bos(vocab);
     const int n_ctx = llama_n_ctx(ctx);
 
-    GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
+    GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
 
     auto tim1 = std::chrono::high_resolution_clock::now();
     LOG_INF("%s: tokenizing the input ..\n", __func__);
@@ -468,7 +470,7 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
     const int n_chunk_max = tokens.size() / n_ctx;
 
     const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
-    const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+    const int n_vocab = llama_vocab_n_tokens(vocab);
     const int n_batch = params.n_batch;
 
     int count = 0;
@@ -508,7 +510,7 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
 
             // add BOS token for the first batch of each chunk
             if (add_bos && j == 0) {
-                tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
+                tokens[batch_start] = llama_vocab_bos(vocab);
             }
 
             common_batch_clear(batch);
@@ -627,7 +629,7 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-    const int n_ctx_train = llama_n_ctx_train(model);
+    const int n_ctx_train = llama_model_n_ctx_train(model);
     if (params.n_ctx > n_ctx_train) {
         LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n",
                 __func__, n_ctx_train, params.n_ctx);
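
The BOS handling in the @@ -508 hunk above follows a common chunking pattern. Below is a hedged sketch of the same idea; the batch_start indexing and the elided decode step are assumptions for illustration, not taken from this diff.

    #include "llama.h"
    #include <vector>

    // For each chunk of n_ctx tokens, overwrite the first token with BOS
    // (only when the model expects one), mirroring the hunk above.
    static void mark_chunk_bos(std::vector<llama_token> & tokens,
                               const llama_vocab * vocab,
                               int n_chunk, int n_ctx, int n_batch) {
        const bool add_bos = llama_vocab_get_add_bos(vocab);

        for (int i = 0; i < n_chunk; ++i) {
            for (int j = 0; j < n_ctx; j += n_batch) {
                const int batch_start = i * n_ctx + j; // assumed indexing
                // add BOS token for the first batch of each chunk
                if (add_bos && j == 0) {
                    tokens[batch_start] = llama_vocab_bos(vocab);
                }
                // ... build a llama_batch from tokens[batch_start ..] and decode ...
            }
        }
    }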
Author: Georgi Gerganov