Merge branch 'master' into gg/llama-kv-cache
ggml-ci
@@ -1213,5 +1213,7 @@ void llama_grammar_accept_str(struct llama_grammar & grammar, const std::string
     }
 
     grammar.partial_utf8 = decoded.second;
-    GGML_ASSERT(!grammar.stacks.empty());
+    if (grammar.stacks.empty()) {
+        throw std::runtime_error("Unexpected empty grammar stack after accepting piece: " + piece);
+    }
 }
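
The hunk above replaces a hard GGML_ASSERT, which aborts the process, with a recoverable std::runtime_error. A minimal sketch of how a caller might take advantage of this, assuming it already holds a constructed llama_grammar from llama.cpp's internal grammar code; the accept_piece_safely wrapper is illustrative and not part of the patch:

    #include <cstdio>
    #include <stdexcept>
    #include <string>

    // Declarations as they appear in llama.cpp's internal grammar code;
    // this sketch must be linked against llama.cpp to actually run.
    struct llama_grammar;
    void llama_grammar_accept_str(struct llama_grammar & grammar, const std::string & piece);

    // Hypothetical helper: report the rejected piece instead of terminating the process.
    static bool accept_piece_safely(struct llama_grammar & grammar, const std::string & piece) {
        try {
            llama_grammar_accept_str(grammar, piece);
            return true;
        } catch (const std::runtime_error & err) {
            fprintf(stderr, "grammar error: %s\n", err.what());
            return false;
        }
    }
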
@@ -3868,7 +3868,8 @@ struct llm_build_context {
                 ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
         cb(k_pe, "k_pe", il);
 
-        kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm
+        // TODO: the CUDA backend used to not support non-cont. (RMS) norm, investigate removing ggml_cont
+        kv_compressed = ggml_cont(ctx0, kv_compressed);
         kv_compressed = build_norm(kv_compressed,
                 model.layers[il].attn_kv_a_norm, NULL,
                 LLM_NORM_RMS, il);
@@ -5702,7 +5703,8 @@ struct llm_build_context {
                 ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
         cb(k_pe, "k_pe", il);
 
-        kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm
+        // TODO: the CUDA backend used to not support non-cont. (RMS) norm, investigate removing ggml_cont
+        kv_compressed = ggml_cont(ctx0, kv_compressed);
         kv_compressed = build_norm(kv_compressed,
                 model.layers[il].attn_kv_a_norm, NULL,
                 LLM_NORM_RMS, il);
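
Both hunks above keep the ggml_cont call and only reword the TODO: the CUDA backend historically required contiguous input for the (RMS) norm, so kv_compressed is made contiguous before build_norm. A minimal CPU-only ggml sketch of that contiguity issue, separate from the diff; the include layout, buffer size, and epsilon are assumptions for a recent ggml tree:

    #include "ggml.h"
    #include "ggml-cpu.h"
    #include <cstdio>

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16 * 1024 * 1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        // A small f32 matrix with sample data.
        struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 4);
        float * data = (float *) a->data;
        for (int i = 0; i < 8 * 4; ++i) {
            data[i] = 0.1f * (float) i;
        }

        // A transposed view is non-contiguous; ggml_cont copies it into a
        // contiguous tensor before the RMS norm, mirroring the kv_compressed path.
        struct ggml_tensor * at   = ggml_transpose(ctx, a);
        struct ggml_tensor * cont = ggml_cont(ctx, at);
        struct ggml_tensor * norm = ggml_rms_norm(ctx, cont, 1e-6f);

        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, norm);
        ggml_graph_compute_with_ctx(ctx, gf, /*n_threads =*/ 1);

        printf("view contiguous: %d, cont contiguous: %d\n",
               ggml_is_contiguous(at), ggml_is_contiguous(cont));

        ggml_free(ctx);
        return 0;
    }
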