CUDA: fuse rope + set_rows (#16884)
* CUDA: add fused rope
* move k forward_expand up
* create helper function instead of re-using params
* make assert statement more in line with comment
* rope_norm: coalesced writes to global mem (sketched below)
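The last bullet refers to the store pattern of the NORM rope variant, where each rotated pair occupies two adjacent elements of a row. Below is a minimal sketch of the idea, not this PR's kernel: each thread rotates one pair and writes it back as a single float2, so the 32 threads of a warp store one contiguous span of global memory. The kernel name, the flattened row layout, and the theta computation are simplified assumptions.

#include <cuda_runtime.h>
#include <math.h>

// Sketch of a NORM-variant RoPE kernel with coalesced stores. One block
// per row, one thread per rotated pair. Names and layout are illustrative
// assumptions, not the kernel from this PR.
__global__ void rope_norm_sketch(
        const float * x,    // input rows, [n_rows][ne0]
        float       * dst,  // output rows, same layout
        const int   * pos,  // position id per row
        int           ne0,  // row length (head size), assumed even
        float         theta_scale) {  // freq_base^(-2/ne0), host-computed
    const int row = blockIdx.x;
    const int i0  = threadIdx.x;
    if (2*i0 >= ne0) {
        return;
    }

    const float theta = (float) pos[row] * powf(theta_scale, (float) i0);
    const float c = cosf(theta);
    const float s = sinf(theta);

    const float x0 = x[(size_t) row*ne0 + 2*i0 + 0];
    const float x1 = x[(size_t) row*ne0 + 2*i0 + 1];

    // one float2 store per thread: adjacent threads write adjacent
    // 8-byte slots, so each warp stores one contiguous 256-byte span
    float2 v;
    v.x = x0*c - x1*s;
    v.y = x0*s + x1*c;
    reinterpret_cast<float2 *>(dst + (size_t) row*ne0)[i0] = v;
}

A launch such as rope_norm_sketch<<<n_rows, ne0/2>>>(x, dst, pos, ne0, powf(freq_base, -2.0f/ne0)) covers one row per block; the backend's actual kernels additionally deal with multiple heads, partial rotation (n_dims < ne0), and FP16/FP32 variants.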
@@ -1592,9 +1592,10 @@ ggml_tensor * llm_graph_context::build_attn(
         int il) const {
 
     // these nodes are added to the graph together so that they are not reordered
     // by doing so, the number of splits in the graph is reduced
+    // expand k later to enable rope fusion which directly writes into k-v cache
     ggml_build_forward_expand(gf, q_cur);
-    ggml_build_forward_expand(gf, k_cur);
     ggml_build_forward_expand(gf, v_cur);
+    ggml_build_forward_expand(gf, k_cur);
 
     const auto * mctx_cur = inp->mctx;
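The reordering above works because ggml_build_forward_expand appends nodes to the graph in call order: expanding k_cur after q_cur and v_cur places the ROPE applied to K directly before the SET_ROWS that scatters it into the KV cache, so the CUDA backend sees the two ops back to back and can replace them with a single fused launch. Below is a minimal sketch of what such a pattern check could look like, using ggml's public graph accessors; the helper name and the exact conditions are assumptions for illustration (a real check must also verify that nothing else consumes the rope result and that types and shapes are supported):

#include "ggml.h"

// Hypothetical backend-side pattern check (illustrative name and
// conditions, not this PR's code): nodes i and i+1 can be handled by a
// single fused launch when a ROPE feeds its result directly into the
// SET_ROWS that scatters rows into the K cache.
static bool can_fuse_rope_set_rows(struct ggml_cgraph * gf, int i) {
    if (i + 1 >= ggml_graph_n_nodes(gf)) {
        return false;
    }
    struct ggml_tensor * rope     = ggml_graph_node(gf, i);
    struct ggml_tensor * set_rows = ggml_graph_node(gf, i + 1);

    if (rope->op != GGML_OP_ROPE || set_rows->op != GGML_OP_SET_ROWS) {
        return false;
    }
    // the values being scattered must be exactly the rope output
    // (assumes SET_ROWS keeps its source tensor in src[0]); a real check
    // would also require that nothing else consumes the rope result
    if (set_rows->src[0] != rope) {
        return false;
    }
    return true;
}

When the check passes, the fused kernel rotates the K values and stores them at their destination cache rows in one pass, eliminating the intermediate global-memory round trip for the standalone rotated K tensor.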