mirror of https://github.com/ggml-org/llama.cpp.git
ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)
* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
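For background on the first bullet: RoPE (rotary position embedding) rotates adjacent pairs of values in each attention head by a position-dependent angle, and this commit adds a CUDA kernel that performs that rotation directly on F16 data. Below is a minimal C sketch of the rotation on one F16 row, using ggml's public fp16 conversion helpers from ggml.h. The function name rope_f16_row is ours, the sketch covers only the default rope mode, and it illustrates the math rather than the new CUDA kernel itself.

```c
#include "ggml.h"
#include <math.h>

// Rotate one row of n_dims F16 values for position `pos`.
// freq_base is typically 10000.0f for LLaMA-style models.
// Each adjacent pair (x[i], x[i+1]) is rotated by
// theta = pos * freq_base^(-i / n_dims).
static void rope_f16_row(ggml_fp16_t * x, int n_dims, int pos, float freq_base) {
    for (int i = 0; i < n_dims; i += 2) {
        const float theta = pos * powf(freq_base, -(float)i / n_dims);
        const float c = cosf(theta);
        const float s = sinf(theta);

        // convert to f32, rotate, convert back to f16
        const float x0 = ggml_fp16_to_fp32(x[i + 0]);
        const float x1 = ggml_fp16_to_fp32(x[i + 1]);

        x[i + 0] = ggml_fp32_to_fp16(x0*c - x1*s);
        x[i + 1] = ggml_fp32_to_fp16(x0*s + x1*c);
    }
}
```

Doing this in F16 avoids converting the KV cache to F32 and back just to apply the rotation, which matters for the rope shift mentioned in the last bullet.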
ggml.c: 2 changed lines (1 addition, 1 deletion)
```diff
@@ -6343,7 +6343,7 @@ static struct ggml_tensor * ggml_cpy_impl(
     }

     // make a view of the destination
-    struct ggml_tensor * result = ggml_view_tensor(ctx, b);
+    struct ggml_tensor * result = b->op == GGML_OP_NONE ? b : ggml_view_tensor(ctx, b);
     if (strlen(b->name) > 0) {
         ggml_format_name(result, "%s (copy of %s)", b->name, a->name);
     } else {
```
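The hunk changes how ggml_cpy picks its result node: when the destination b is a leaf tensor (b->op == GGML_OP_NONE), b itself now becomes the output node of the copy, instead of a freshly created view of b; only non-leaf destinations still get wrapped in a view so their own producing op is not overwritten. A small usage sketch of the effect follows; the variable names are ours, while ggml_cpy and ggml_new_tensor_1d are real ggml API calls of this era.

```c
#include "ggml.h"

void cpy_result_example(struct ggml_context * ctx, struct ggml_tensor * src) {
    // destination leaf: op == GGML_OP_NONE
    struct ggml_tensor * dst = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, src->ne[0]);

    struct ggml_tensor * cpy = ggml_cpy(ctx, src, dst);

    // before this commit: cpy was always a new view of dst (one extra graph node)
    // after this commit:  cpy == dst, and dst->op is set to GGML_OP_CPY
    (void) cpy; // silence unused-variable warning in this standalone sketch
}
```

Reusing the leaf as the copy's output keeps one node per copy out of the graph, which adds up when parallel decoding and the rope shift issue many KV-cache copies per evaluation.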