mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-10-28 08:31:25 +00:00
graph : support non-contiguous Q in build_attn_mha (#15908)
* support non-contiguous Q in build_attn_mha

* Update src/llama-graph.cpp

ggml-ci

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
@@ -1273,7 +1273,7 @@ ggml_tensor * llm_graph_context::build_attn_mha(
|
|||||||
// split the batch into streams if needed
|
// split the batch into streams if needed
|
||||||
const auto n_stream = k->ne[3];
|
const auto n_stream = k->ne[3];
|
||||||
|
|
||||||
q = ggml_reshape_4d(ctx0, q, q->ne[0], q->ne[1], q->ne[2]/n_stream, n_stream);
|
q = ggml_view_4d(ctx0, q, q->ne[0], q->ne[1], q->ne[2]/n_stream, n_stream, q->nb[1], q->nb[2], q->nb[3]/n_stream, 0);
|
||||||
|
|
||||||
q = ggml_permute(ctx0, q, 0, 2, 1, 3);
|
q = ggml_permute(ctx0, q, 0, 2, 1, 3);
|
||||||
k = ggml_permute(ctx0, k, 0, 2, 1, 3);
|
k = ggml_permute(ctx0, k, 0, 2, 1, 3);
|
||||||
|
|||||||
Reference in New Issue
Block a user