Mirror of https://github.com/ggml-org/llama.cpp.git — synced 2025-10-31 08:51:55 +00:00.
			
		
		
		
	llama : add LLAMA_OFFLOAD_DEBUG + fix starcoder offloading
This commit is contained in:
		
							
								
								
									
										115
									
								
								llama.cpp
									
									
									
									
									
								
							
							
						
						
									
										115
									
								
								llama.cpp
									
									
									
									
									
								
							| @@ -3548,11 +3548,11 @@ static struct ggml_cgraph * llm_build_llama( | ||||
|                     model.layers[il].ffn_gate, NULL, | ||||
|                     model.layers[il].ffn_down, NULL, | ||||
|                     LLM_FFN_SILU, LLM_FFN_PAR, cb, il); | ||||
|             cb(cur, "ffn_result", il); | ||||
|             cb(cur, "ffn_out", il); | ||||
|         } | ||||
|  | ||||
|         cur = ggml_add(ctx0, cur, inpFF); | ||||
|         cb(cur, "inpFF_+_result_w2", il); | ||||
|         cb(cur, "inpFF_ffn_out", il); | ||||
|  | ||||
|         // input for next layer | ||||
|         inpL = cur; | ||||
| @@ -3714,11 +3714,11 @@ static struct ggml_cgraph * llm_build_baichaun( | ||||
|                     model.layers[il].ffn_gate, NULL, | ||||
|                     model.layers[il].ffn_down, NULL, | ||||
|                     LLM_FFN_SILU, LLM_FFN_PAR, cb, il); | ||||
|             cb(cur, "ffn_result", il); | ||||
|             cb(cur, "ffn_out", il); | ||||
|         } | ||||
|  | ||||
|         cur = ggml_add(ctx0, cur, inpFF); | ||||
|         cb(cur, "inpFF_+_result_w2", il); | ||||
|         cb(cur, "inpFF_ffn_out", il); | ||||
|  | ||||
|         // input for next layer | ||||
|         inpL = cur; | ||||
| @@ -3884,14 +3884,14 @@ static struct ggml_cgraph * llm_build_falcon( | ||||
|                     NULL,                      NULL, | ||||
|                     model.layers[il].ffn_down, NULL, | ||||
|                     LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); | ||||
|             cb(cur, "ffn_result", il); | ||||
|             cb(cur, "ffn_out", il); | ||||
|         } | ||||
|  | ||||
|         cur = ggml_add(ctx0, cur, attn_out); | ||||
|         cb(cur, "inpFF_+_result_w2", il); | ||||
|         cb(cur, "inpFF_ffn_out", il); | ||||
|  | ||||
|         cur = ggml_add(ctx0, cur, inpL); | ||||
|         cb(cur, "inpL_+_inpFF_+_result_w2", il); | ||||
|         cb(cur, "inpL_inpFF_ffn_out", il); | ||||
|  | ||||
|         // input for next layer | ||||
|         inpL = cur; | ||||
| @@ -3988,6 +3988,7 @@ static struct ggml_cgraph * llm_build_starcoder( | ||||
|     cb(KQ_mask, "KQ_mask", -1); | ||||
|  | ||||
|     pos = ggml_get_rows(ctx0, model.pos_embeddings, inp_pos); | ||||
|     cb(pos, "pos_embd", -1); | ||||
|  | ||||
|     inpL = ggml_add(ctx0, embd, pos); | ||||
|     cb(inpL, "inpL", -1); | ||||
| @@ -4027,7 +4028,7 @@ static struct ggml_cgraph * llm_build_starcoder( | ||||
|  | ||||
|         // Add the input | ||||
|         cur = ggml_add(ctx0, cur, inpL); | ||||
|         cb(cur, "inpL_+_result_wo", il); | ||||
|         cb(cur, "inpL_kqv_out", il); | ||||
|  | ||||
|         struct ggml_tensor * inpFF = cur; | ||||
|  | ||||
| @@ -4044,11 +4045,11 @@ static struct ggml_cgraph * llm_build_starcoder( | ||||
|                     NULL,                      NULL, | ||||
|                     model.layers[il].ffn_down, model.layers[il].ffn_down_b, | ||||
|                     LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); | ||||
|             cb(cur, "ffn_result", il); | ||||
|             cb(cur, "ffn_out", il); | ||||
|         } | ||||
|  | ||||
|         inpL = ggml_add(ctx0, cur, inpFF); | ||||
|  | ||||
|         cb(inpL, "inpL_inpFF_ffn_out", il); | ||||
|     } | ||||
|  | ||||
|     cur = llm_build_norm(ctx0, inpL, | ||||
| @@ -4294,11 +4295,11 @@ static struct ggml_cgraph * llm_build_persimmon( | ||||
|                     NULL,                      NULL, | ||||
|                     model.layers[il].ffn_down, model.layers[il].ffn_down_b, | ||||
|                     LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il); | ||||
|             cb(cur, "ffn_result", il); | ||||
|             cb(cur, "ffn_out", il); | ||||
|         } | ||||
|  | ||||
|         cur = ggml_add(ctx0, cur, inpFF); | ||||
|         cb(cur, "inpFF_+_result_w2", il); | ||||
|         cb(cur, "inpFF_ffn_out", il); | ||||
|  | ||||
|         inpL = cur; | ||||
|     } | ||||
| @@ -4432,11 +4433,11 @@ static struct ggml_cgraph * llm_build_refact( | ||||
|                     model.layers[il].ffn_gate, NULL, | ||||
|                     model.layers[il].ffn_down, NULL, | ||||
|                     LLM_FFN_SILU, LLM_FFN_PAR, cb, il); | ||||
|             cb(cur, "ffn_result", il); | ||||
|             cb(cur, "ffn_out", il); | ||||
|         } | ||||
|  | ||||
|         cur = ggml_add(ctx0, cur, inpFF); | ||||
|         cb(cur, "inpFF_+_result_w2", il); | ||||
|         cb(cur, "inpFF_ffn_out", il); | ||||
|  | ||||
|         // input for next layer | ||||
|         inpL = cur; | ||||
| @@ -4569,7 +4570,7 @@ static struct ggml_cgraph * llm_build_bloom( | ||||
|  | ||||
|         // Add the input | ||||
|         cur = ggml_add(ctx0, cur, inpL); | ||||
|         cb(cur, "inpL_+_result_wo", il); | ||||
|         cb(cur, "inpL_kqv_out", il); | ||||
|  | ||||
|         struct ggml_tensor * inpFF = cur; | ||||
|  | ||||
| @@ -4586,11 +4587,11 @@ static struct ggml_cgraph * llm_build_bloom( | ||||
|                     NULL,                      NULL, | ||||
|                     model.layers[il].ffn_down, model.layers[il].ffn_down_b, | ||||
|                     LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); | ||||
|             cb(cur, "ffn_result", il); | ||||
|             cb(cur, "ffn_out", il); | ||||
|         } | ||||
|  | ||||
|         inpL = ggml_add(ctx0, cur, inpFF); | ||||
|         cb(inpL, "inpFF_+_result_w2", il); | ||||
|         cb(inpL, "inpFF_ffn_out", il); | ||||
|     } | ||||
|  | ||||
|     cur = llm_build_norm(ctx0, inpL, | ||||
| @@ -4717,7 +4718,7 @@ static struct ggml_cgraph * llm_build_mpt( | ||||
|  | ||||
|         // Add the input | ||||
|         cur = ggml_add(ctx0, cur, inpL); | ||||
|         cb(cur, "inpL_+_result_wo", il); | ||||
|         cb(cur, "inpL_kqv_out", il); | ||||
|  | ||||
|         struct ggml_tensor * attn_out = cur; | ||||
|  | ||||
| @@ -4734,11 +4735,11 @@ static struct ggml_cgraph * llm_build_mpt( | ||||
|                     NULL,                      NULL, | ||||
|                     model.layers[il].ffn_down, NULL, | ||||
|                     LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); | ||||
|             cb(cur, "ffn_result", il); | ||||
|             cb(cur, "ffn_out", il); | ||||
|         } | ||||
|  | ||||
|         cur = ggml_add(ctx0, cur, attn_out); | ||||
|         cb(cur, "inpL_+_inpFF_+_result_w2", il); | ||||
|         cb(cur, "inpL_inpFF_ffn_out", il); | ||||
|  | ||||
|         // input for next layer | ||||
|         inpL = cur; | ||||
| @@ -4777,6 +4778,7 @@ enum llm_offload_func_e { | ||||
|     OFFLOAD_FUNC_OUT, | ||||
| }; | ||||
|  | ||||
| // TODO: will be removed with backend v2 | ||||
| struct llm_offload_trie { | ||||
|     struct node { | ||||
|         ~node() { | ||||
| @@ -4850,10 +4852,12 @@ struct llm_offload_trie { | ||||
|     node * root = nullptr; | ||||
| }; | ||||
|  | ||||
| // TODO: will be removed with backend v2 | ||||
| static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map = { | ||||
|   //{ "inp_tokens",                 OFFLOAD_FUNC_NR  }, // TODO: missing K-quants get_rows kernel | ||||
|   //{ "inp_embd",                   OFFLOAD_FUNC_NR  }, // TODO: missing K-quants get_rows kernel | ||||
|     { "inp_pos",                    OFFLOAD_FUNC_NR  }, | ||||
|     { "pos_embd",                   OFFLOAD_FUNC_NR  }, | ||||
|  | ||||
|     { "KQ_mask",                    OFFLOAD_FUNC_NR  }, | ||||
|     { "K_shift",                    OFFLOAD_FUNC_NR  }, | ||||
| @@ -4902,7 +4906,7 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map | ||||
|     { "kqv_wo",                     OFFLOAD_FUNC_V   }, | ||||
|     { "kqv_out",                    OFFLOAD_FUNC_V   }, | ||||
|  | ||||
|     { "inpL_+_result_wo",           OFFLOAD_FUNC     }, | ||||
|     { "inpL_kqv_out",               OFFLOAD_FUNC     }, | ||||
|     { "inpFF",                      OFFLOAD_FUNC     }, | ||||
|  | ||||
|     { "ffn_norm",                   OFFLOAD_FUNC     }, | ||||
| @@ -4914,15 +4918,15 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map | ||||
|     { "ffn_gate_par",               OFFLOAD_FUNC     }, | ||||
|     { "ffn_down",                   OFFLOAD_FUNC     }, | ||||
|     { "ffn_down_b",                 OFFLOAD_FUNC     }, | ||||
|     { "ffn_result",                 OFFLOAD_FUNC     }, | ||||
|     { "ffn_out",                    OFFLOAD_FUNC     }, | ||||
|  | ||||
|     { "ffn_silu",                   OFFLOAD_FUNC     }, | ||||
|     { "ffn_gelu",                   OFFLOAD_FUNC     }, | ||||
|     { "ffn_relu",                   OFFLOAD_FUNC     }, | ||||
|     { "ffn_sqr(relu)",              OFFLOAD_FUNC     }, | ||||
|  | ||||
|     { "inpFF_+_result_w2",          OFFLOAD_FUNC     }, | ||||
|     { "inpL_+_inpFF_+_result_w2",   OFFLOAD_FUNC     }, | ||||
|     { "inpFF_ffn_out",              OFFLOAD_FUNC     }, | ||||
|     { "inpL_inpFF_ffn_out",         OFFLOAD_FUNC     }, | ||||
|  | ||||
|     { "result_norm",                OFFLOAD_FUNC_EMB }, | ||||
|     { "result_output",              OFFLOAD_FUNC_OUT }, | ||||
| @@ -4946,6 +4950,14 @@ static struct ggml_cgraph * llama_build_graph( | ||||
|     bool alloc_inp_KQ_mask  = false; | ||||
|     bool alloc_inp_K_shift  = false; | ||||
|  | ||||
| #ifdef GGML_USE_CUBLAS | ||||
|     const bool do_offload = true; | ||||
| #else | ||||
|     const bool do_offload = true; // TODO: set to false after finishing refactoring | ||||
| #endif | ||||
|  | ||||
|     int n_non_view = 0; // number of non-view tensors that have been processed by the callback | ||||
|  | ||||
|     // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.) | ||||
|     llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) { | ||||
|         if (il >= 0) { | ||||
| @@ -5053,23 +5065,23 @@ static struct ggml_cgraph * llama_build_graph( | ||||
|             alloc_inp_K_shift = true; | ||||
|         } | ||||
|  | ||||
|         // | ||||
|         // offload layers | ||||
|         // | ||||
|         // TODO: this code will be obsoleted with backend v2 | ||||
|  | ||||
| #ifdef GGML_USE_CUBLAS | ||||
|         const bool do_offload = true; | ||||
| #else | ||||
|         const bool do_offload = true; // TODO: set to false after finishing refactoring | ||||
| #endif | ||||
|  | ||||
|         if (!do_offload) { | ||||
|         // view tensors are not processed further | ||||
|         if (cur->view_src != nullptr) { | ||||
|             return; | ||||
|         } | ||||
|  | ||||
|         // view tensors are not offloaded | ||||
|         if (cur->view_src != nullptr) { | ||||
|         if (cur->op != GGML_OP_NONE) { | ||||
|             n_non_view++; | ||||
|         } | ||||
|  | ||||
|         // | ||||
|         // offload layers | ||||
|         // | ||||
|         // TODO: will be removed with backend v2 | ||||
|  | ||||
| //#define LLAMA_OFFLOAD_DEBUG | ||||
|  | ||||
|         if (!do_offload) { | ||||
|             return; | ||||
|         } | ||||
|  | ||||
| @@ -5103,11 +5115,13 @@ static struct ggml_cgraph * llama_build_graph( | ||||
|         llm_offload_func_e func_e = k_offload_func_trie.find(name); | ||||
|  | ||||
|         if (func_e == OFFLOAD_FUNC_NOP) { | ||||
| #ifdef LLAMA_OFFLOAD_DEBUG | ||||
|             // if a tensor hasn't been offloaded, we warn the user | ||||
|             if (worst_case) { | ||||
|                 LLAMA_LOG_WARN("%s: %32s: not offloaded (ref: %s)\n", __func__, | ||||
|                         cur->name, "https://github.com/ggerganov/llama.cpp/pull/3837"); | ||||
|             } | ||||
| #endif | ||||
|  | ||||
|             return; | ||||
|         } | ||||
| @@ -5170,9 +5184,11 @@ static struct ggml_cgraph * llama_build_graph( | ||||
|         // apply offload function to the tensor | ||||
|         func(cur); | ||||
|  | ||||
| #ifdef LLAMA_OFFLOAD_DEBUG | ||||
|         if (worst_case) { | ||||
|             LLAMA_LOG_INFO("%s: %32s: %s\n", __func__, cur->name, k_offload_func_name.at(func_e).c_str()); | ||||
|         } | ||||
| #endif | ||||
|     }; | ||||
|  | ||||
|     struct ggml_cgraph * result = NULL; | ||||
| @@ -5214,6 +5230,29 @@ static struct ggml_cgraph * llama_build_graph( | ||||
|             GGML_ASSERT(false); | ||||
|     } | ||||
|  | ||||
|     if (worst_case) { | ||||
|         int n_non_view_total = 0; | ||||
|  | ||||
|         for (int i = 0; i < result->n_nodes; ++i) { | ||||
|             if (result->nodes[i]->view_src == nullptr) { | ||||
|                 n_non_view_total++; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         LLAMA_LOG_INFO("%s: non-view tensors processed: %d/%d\n", __func__, n_non_view, n_non_view_total); | ||||
|  | ||||
| #ifdef LLAMA_OFFLOAD_DEBUG | ||||
|         if (n_non_view != n_non_view_total) { | ||||
|             LLAMA_LOG_WARN("%s: ****************************************************************\n", __func__); | ||||
|             LLAMA_LOG_WARN("%s: not all non-view tensors have been processed with a callback\n",     __func__); | ||||
|             LLAMA_LOG_WARN("%s: this can indicate an inefficiency in the graph implementation\n",    __func__); | ||||
|             LLAMA_LOG_WARN("%s: build with LLAMA_OFFLOAD_DEBUG for more info\n",                     __func__); | ||||
|             LLAMA_LOG_WARN("%s: ref: https://github.com/ggerganov/llama.cpp/pull/3837\n",            __func__); | ||||
|             LLAMA_LOG_WARN("%s: ****************************************************************\n", __func__); | ||||
|         } | ||||
| #endif | ||||
|     } | ||||
|  | ||||
|     return result; | ||||
| } | ||||
|  | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Georgi Gerganov
					Georgi Gerganov