llama : move sampling code into llama-sampling

ggml-ci
2025-11-09 10:17:06 +00:00 · 2024-07-19 18:15:36 +03:00
parent 081fe431aa
commit 0ddc8e361c
7 changed files with 758 additions and 699 deletions
--- a/include/llama.h
+++ b/include/llama.h
@@ -1084,12 +1084,6 @@ extern "C" {
          llama_token_data_array * candidates,
                           float   temp);

-    /// @details Apply constraints from grammar
-    LLAMA_API void llama_sample_grammar(
-            struct llama_context * ctx,
-          llama_token_data_array * candidates,
-      const struct llama_grammar * grammar);
-
    /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
    /// @param tau  The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
@@ -1127,6 +1121,12 @@ extern "C" {
            struct llama_context * ctx,
          llama_token_data_array * candidates);

+    /// @details Apply constraints from grammar
+    LLAMA_API void llama_sample_grammar(
+            struct llama_context * ctx,
+          llama_token_data_array * candidates,
+      const struct llama_grammar * grammar);
+
    /// @details Accepts the sampled token into the grammar
    LLAMA_API void llama_grammar_accept_token(
            struct llama_context * ctx,