mirror of https://github.com/ggml-org/llama.cpp.git
ggml-cuda : add rope f16, restore performance with parallel decoding (#3272)
* ggml-cuda : add rope f16, restore performance

* offload KQ_mask with all models

* fix rope shift

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
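For background on the first bullet: RoPE (rotary position embedding) rotates adjacent pairs of values in each attention head by a position-dependent angle, and this commit adds a CUDA kernel that performs that rotation directly on F16 data. Below is a minimal C sketch of the rotation on one F16 row, using ggml's public fp16 conversion helpers from ggml.h. The function name rope_f16_row is ours, the sketch covers only the default rope mode, and it illustrates the math rather than the new CUDA kernel itself.

```c
#include "ggml.h"
#include <math.h>

// Rotate one row of n_dims F16 values for position `pos`.
// freq_base is typically 10000.0f for LLaMA-style models.
// Each adjacent pair (x[i], x[i+1]) is rotated by
// theta = pos * freq_base^(-i / n_dims).
static void rope_f16_row(ggml_fp16_t * x, int n_dims, int pos, float freq_base) {
    for (int i = 0; i < n_dims; i += 2) {
        const float theta = pos * powf(freq_base, -(float)i / n_dims);
        const float c = cosf(theta);
        const float s = sinf(theta);

        // convert to f32, rotate, convert back to f16
        const float x0 = ggml_fp16_to_fp32(x[i + 0]);
        const float x1 = ggml_fp16_to_fp32(x[i + 1]);

        x[i + 0] = ggml_fp32_to_fp16(x0*c - x1*s);
        x[i + 1] = ggml_fp32_to_fp16(x0*s + x1*c);
    }
}
```

Doing this in F16 avoids converting the KV cache to F32 and back just to apply the rotation, which matters for the rope shift mentioned in the last bullet.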
ggml.c: 2 changed lines (1 addition, 1 deletion)
```diff
@@ -6343,7 +6343,7 @@ static struct ggml_tensor * ggml_cpy_impl(
     }

     // make a view of the destination
-    struct ggml_tensor * result = ggml_view_tensor(ctx, b);
+    struct ggml_tensor * result = b->op == GGML_OP_NONE ? b : ggml_view_tensor(ctx, b);
     if (strlen(b->name) > 0) {
         ggml_format_name(result, "%s (copy of %s)", b->name, a->name);
     } else {
```
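The hunk changes how ggml_cpy picks its result node: when the destination b is a leaf tensor (b->op == GGML_OP_NONE), b itself now becomes the output node of the copy, instead of a freshly created view of b; only non-leaf destinations still get wrapped in a view so their own producing op is not overwritten. A small usage sketch of the effect follows; the variable names are ours, while ggml_cpy and ggml_new_tensor_1d are real ggml API calls of this era.

```c
#include "ggml.h"

void cpy_result_example(struct ggml_context * ctx, struct ggml_tensor * src) {
    // destination leaf: op == GGML_OP_NONE
    struct ggml_tensor * dst = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, src->ne[0]);

    struct ggml_tensor * cpy = ggml_cpy(ctx, src, dst);

    // before this commit: cpy was always a new view of dst (one extra graph node)
    // after this commit:  cpy == dst, and dst->op is set to GGML_OP_CPY
    (void) cpy; // silence unused-variable warning in this standalone sketch
}
```

Reusing the leaf as the copy's output keeps one node per copy out of the graph, which adds up when parallel decoding and the rope shift issue many KV-cache copies per evaluation.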