llama : add llm_build_inp_embd helper

2025-10-30 08:42:00 +00:00 · 2023-10-31 16:43:08 +02:00
parent 2073347e3b
commit 7923b70cb8
1 changed files with 50 additions and 111 deletions
--- a/llama.cpp
+++ b/llama.cpp
@@ -1228,8 +1228,8 @@ struct llama_model {
    llama_hparams hparams = {};
    llama_vocab   vocab;

-    struct ggml_tensor * tok_embeddings;
-    struct ggml_tensor * pos_embeddings;
+    struct ggml_tensor * tok_embd;
+    struct ggml_tensor * pos_embd;
    struct ggml_tensor * tok_norm;
    struct ggml_tensor * tok_norm_b;

@@ -2484,7 +2484,7 @@ static void llm_load_tensors(
            case LLM_ARCH_LLAMA:
            case LLM_ARCH_REFACT:
                {
-                    model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+                    model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);

                    // output
                    {
@@ -2552,7 +2552,7 @@ static void llm_load_tensors(
                } break;
            case LLM_ARCH_BAICHUAN:
                {
-                    model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+                    model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
                    {
                        ggml_backend_type backend_norm;
                        ggml_backend_type backend_output;
@@ -2620,7 +2620,7 @@ static void llm_load_tensors(
                {
                    // TODO: CPU-only for now

-                    model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+                    model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);

                    // output
                    {
@@ -2696,8 +2696,8 @@ static void llm_load_tensors(
                } break;
            case LLM_ARCH_STARCODER:
                {
-                    model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab},             GGML_BACKEND_CPU);
-                    model.pos_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"),   {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU);
+                    model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab},             GGML_BACKEND_CPU);
+                    model.pos_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"),   {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU);

                    // output
                    {
@@ -2775,7 +2775,7 @@ static void llm_load_tensors(
                } break;
            case LLM_ARCH_PERSIMMON:
                {
-                    model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"),  {n_embd, n_vocab}, GGML_BACKEND_CPU);
+                    model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"),  {n_embd, n_vocab}, GGML_BACKEND_CPU);

                    {
                        ggml_backend_type backend_norm;
@@ -2838,7 +2838,7 @@ static void llm_load_tensors(
                {
                    // TODO: CPU-only for now

-                    model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD,      "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+                    model.tok_embd   = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD,      "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
                    model.tok_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd},          GGML_BACKEND_CPU);
                    model.tok_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"),   {n_embd},          GGML_BACKEND_CPU);

@@ -2918,7 +2918,7 @@ static void llm_load_tensors(
                } break;
            case LLM_ARCH_MPT:
                {
-                    model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+                    model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);

                    // output
                    {
@@ -3099,6 +3099,31 @@ enum llm_rope_type {
    LLM_ROPE_GLM,
 };

+static struct ggml_tensor * llm_build_inp_embd(
+        struct ggml_context * ctx,
+          const llama_batch & batch,
+         struct ggml_tensor * tok_embd,
+                    int64_t   n_embd,
+                    int32_t   n_tokens,
+         const llm_build_cb & cb) {
+    struct ggml_tensor * inpL;
+
+    if (batch.token) {
+        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_tokens);
+        cb(inp_tokens, "inp_tokens", -1);
+
+        inpL = ggml_get_rows(ctx, tok_embd, inp_tokens);
+    } else {
+#ifdef GGML_USE_MPI
+        GGML_ASSERT(false && "not implemented");
+#endif
+
+        inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);
+    }
+
+    return inpL;
+}
+
 // Persimmon: n_rot = n_embd_head/2
 // Other:     n_rot = n_embd_head
 static void llm_build_k_shift(
@@ -3463,18 +3488,7 @@ static struct ggml_cgraph * llm_build_llama(
    struct ggml_tensor * cur;
    struct ggml_tensor * inpL;

-    if (batch.token) {
-        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
-        cb(inp_tokens, "inp_tokens", -1);
-
-        inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
-    } else {
-#ifdef GGML_USE_MPI
-        GGML_ASSERT(false && "not implemented");
-#endif
-
-        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
-    }
+    inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb);
    cb(inpL, "inp_embd", -1);

    // inp_pos - contains the positions
@@ -3619,18 +3633,7 @@ static struct ggml_cgraph * llm_build_baichaun(
    struct ggml_tensor * cur;
    struct ggml_tensor * inpL;

-    if (batch.token) {
-        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
-        cb(inp_tokens, "inp_tokens", -1);
-
-        inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
-    } else {
-#ifdef GGML_USE_MPI
-        GGML_ASSERT(false && "not implemented");
-#endif
-
-        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
-    }
+    inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb);
    cb(inpL, "inp_embd", -1);

    // inp_pos - contains the positions
@@ -3789,18 +3792,7 @@ static struct ggml_cgraph * llm_build_falcon(
    struct ggml_tensor * cur;
    struct ggml_tensor * inpL;

-    if (batch.token) {
-        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
-        cb(inp_tokens, "inp_tokens", -1);
-
-        inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
-    } else {
-#ifdef GGML_USE_MPI
-        GGML_ASSERT(false && "not implemented");
-#endif
-
-        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
-    }
+    inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb);
    cb(inpL, "inp_embd", -1);

    // inp_pos - contains the positions
@@ -3953,23 +3945,11 @@ static struct ggml_cgraph * llm_build_starcoder(
    ggml_cgraph * gf = ggml_new_graph(ctx0);

    struct ggml_tensor * cur;
-    struct ggml_tensor * embd;
    struct ggml_tensor * pos;
    struct ggml_tensor * inpL;

-    if (batch.token) {
-        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
-        cb(inp_tokens, "inp_tokens", -1);
-
-        embd = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
-    } else {
-#ifdef GGML_USE_MPI
-        GGML_ASSERT(false && "not implemented");
-#endif
-
-        embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
-    }
-    cb(embd, "inp_embd", -1);
+    inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb);
+    cb(inpL, "inp_embd", -1);

    // inp_pos - contains the positions
    struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
@@ -3983,10 +3963,10 @@ static struct ggml_cgraph * llm_build_starcoder(
    struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
    cb(KQ_mask, "KQ_mask", -1);

-    pos = ggml_get_rows(ctx0, model.pos_embeddings, inp_pos);
+    pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
    cb(pos, "pos_embd", -1);

-    inpL = ggml_add(ctx0, embd, pos);
+    inpL = ggml_add(ctx0, inpL, pos);
    cb(inpL, "inpL", -1);

    for (int il = 0; il < n_layer; ++il) {
@@ -4108,14 +4088,7 @@ static struct ggml_cgraph * llm_build_persimmon(
    struct ggml_tensor * cur;
    struct ggml_tensor * inpL;

-    if (batch.token) {
-        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
-        cb(inp_tokens, "inp_tokens", -1);
-
-        inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
-    } else {
-        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
-    }
+    inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb);
    cb(inpL, "imp_embd", -1);

    struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
@@ -4358,18 +4331,7 @@ static struct ggml_cgraph * llm_build_refact(
    struct ggml_tensor * cur;
    struct ggml_tensor * inpL;

-    if (batch.token) {
-        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
-        cb(inp_tokens, "inp_tokens", -1);
-
-        inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
-    } else {
-#ifdef GGML_USE_MPI
-        GGML_ASSERT(false && "not implemented");
-#endif
-
-        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
-    }
+    inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb);
    cb(inpL, "inp_embd", -1);

    // KQ_scale
@@ -4499,22 +4461,10 @@ static struct ggml_cgraph * llm_build_bloom(
    ggml_cgraph * gf = ggml_new_graph(ctx0);

    struct ggml_tensor * cur;
-    struct ggml_tensor * embd;
    struct ggml_tensor * inpL;

-    if (batch.token) {
-        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
-        cb(inp_tokens, "inp_tokens", -1);
-
-        embd = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
-    } else {
-#ifdef GGML_USE_MPI
-        GGML_ASSERT(false && "not implemented");
-#endif
-
-        embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
-    }
-    cb(embd, "inp_embd", -1);
+    inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb);
+    cb(inpL, "inp_embd", -1);

    // KQ_scale
    struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
@@ -4524,7 +4474,7 @@ static struct ggml_cgraph * llm_build_bloom(
    struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
    cb(KQ_mask, "KQ_mask", -1);

-    inpL = llm_build_norm(ctx0, embd,
+    inpL = llm_build_norm(ctx0, inpL,
            model.tok_norm,
            model.tok_norm_b,
            LLM_NORM, norm_eps, cb, -1);
@@ -4648,18 +4598,7 @@ static struct ggml_cgraph * llm_build_mpt(
    struct ggml_tensor * cur;
    struct ggml_tensor * inpL;

-    if (batch.token) {
-        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
-        cb(inp_tokens, "inp_tokens", -1);
-
-        inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
-    } else {
-#ifdef GGML_USE_MPI
-        GGML_ASSERT(false && "not implemented");
-#endif
-
-        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
-    }
+    inpL = llm_build_inp_embd(ctx0, batch, model.tok_embd, n_embd, n_tokens, cb);
    cb(inpL, "inp_embd", -1);

    // KQ_scale