kv-cache : fix SWA checks + disable cacheless iSWA (#15811)

ggml-ci
Georgi Gerganov
2025-09-05 10:39:22 +03:00
committed by GitHub
parent 5d6688de08
commit c610b6c11b
9 changed files with 29 additions and 11 deletions
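
Context for the diff below: the cacheless attention input (llm_graph_input_attn_no_cache) builds the KQ mask directly from the current ubatch, with no KV cache behind it. In an iSWA model only some layers use sliding-window attention, so applying the SWA check uniformly to this single mask is wrong. The first hunk therefore asserts that the cacheless path only accepts LLAMA_SWA_TYPE_NONE, and the second hunk comments out the blanket SWA check until a per-layer variant exists. For reference, a minimal sketch of what a standard sliding-window check amounts to; the window-size parameter n_swa and the exact boundary condition are assumptions here, the helper actually used is hparams.is_masked_swa(p0, p1):

// Sketch only, not the llama.cpp implementation of is_masked_swa.
#include <cstdint>

using llama_pos = int32_t;

// A key at position p0 is masked for a query at position p1 once it falls
// outside the sliding window of (assumed) size n_swa.
static bool is_masked_swa_sketch(llama_pos p0, llama_pos p1, uint32_t n_swa) {
    return p1 - p0 >= (llama_pos) n_swa;
}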

@@ -297,6 +297,9 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
     float * data = (float *) kq_mask->data;

+    // [TAG_NO_CACHE_ISWA]
+    GGML_ASSERT(hparams.swa_type == LLAMA_SWA_TYPE_NONE && "TODO: implement");
+
     for (int h = 0; h < 1; ++h) {
         for (int i1 = 0; i1 < n_tokens; ++i1) {
             const llama_seq_id s1 = ubatch->seq_id[i1][0];
@@ -315,9 +318,10 @@ void llm_graph_input_attn_no_cache::set_input(const llama_ubatch * ubatch) {
                         continue; // skip future tokens for causal attention
                     }

-                    if (hparams.is_masked_swa(ubatch->pos[i0], ubatch->pos[i1])) {
-                        continue; // skip masked tokens for SWA
-                    }
+                    // TODO: this does not take into account that some layers are SWA and others are not (i.e. iSWA) [TAG_NO_CACHE_ISWA]
+                    //if (hparams.is_masked_swa(ubatch->pos[i0], ubatch->pos[i1])) {
+                    //    continue; // skip masked tokens for SWA
+                    //}

                     // TODO: reimplement this like in llama_kv_cache_unified
                     if (hparams.use_alibi) {
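
Per the TODOs above, a proper cacheless iSWA implementation would presumably follow llama_kv_cache_unified and keep a separate mask for the SWA layers, applying the window check only there. Below is a minimal sketch of that idea; the names (kq_masks, build_masks, n_swa) are assumptions for illustration, not the actual fix:

// Sketch only: build a full causal mask for regular layers and a windowed
// mask for SWA layers in one pass over the token positions.
#include <cmath>
#include <cstdint>
#include <vector>

using llama_pos = int32_t;

struct kq_masks {
    std::vector<float> full; // used by non-SWA layers
    std::vector<float> swa;  // used by SWA layers
};

static kq_masks build_masks(const std::vector<llama_pos> & pos, uint32_t n_swa, bool causal) {
    const size_t n = pos.size();

    kq_masks m = {
        std::vector<float>(n*n, -INFINITY),
        std::vector<float>(n*n, -INFINITY),
    };

    for (size_t i1 = 0; i1 < n; ++i1) {     // query token
        for (size_t i0 = 0; i0 < n; ++i0) { // key token
            if (causal && pos[i0] > pos[i1]) {
                continue; // future token: masked in both masks
            }

            m.full[i1*n + i0] = 0.0f;

            if (pos[i1] - pos[i0] < (llama_pos) n_swa) {
                m.swa[i1*n + i0] = 0.0f; // still inside the sliding window
            }
        }
    }

    return m;
}

Each layer would then pick m.full or m.swa depending on whether it uses sliding-window attention; until something along these lines is implemented, the assertion above simply rejects cacheless iSWA.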