From 1d0125bcf1cbd7195ad0faf826a20bc7cec7d3f4 Mon Sep 17 00:00:00 2001
From: Gabe Goodhart
Date: Mon, 22 Sep 2025 12:40:10 -0600
Subject: [PATCH] feat: Add conversion support in GraniteHybrid for non-hybrid (all attn) (#16177)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is a configuration of the hparams in the GraniteHybrid architecture
that devolves to the Granite (or GraniteMoe) architecture (i.e. Granite
3.x). It may be used for some models in the Granite 4 family, with the
GraniteHybrid architecture acting as a superset arch. Rather than support
it directly in the C++ graph, we simply coerce the architecture flag back
to the correct "granite" or "granitemoe" architecture.

Branch: gabe-l-hart/GraniteNonHybridConversion

Signed-off-by: Gabe Goodhart
Co-authored-by: Sigbjørn Skjæret
---
 convert_hf_to_gguf.py | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 7ddec48ad7..9ebd8567ad 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -7656,6 +7656,21 @@ class GraniteHybridModel(Mamba2Model, GraniteMoeModel):
             if i not in self._attn_layers
         ]
 
+        # There are some models in this family that are non-hybrid, but keep the
+        # same parent class by setting all layers to "attention." If this is the
+        # case, the model architecture needs to be updated to a standard
+        # "granite" or "granitemoe" model
+        if not self._ssm_layers:
+            has_experts = self.find_hparam(["num_experts_per_tok"], optional=True)
+            new_arch = (
+                gguf.MODEL_ARCH.GRANITE_MOE
+                if has_experts else
+                gguf.MODEL_ARCH.GRANITE
+            )
+            self.model_arch = new_arch
+            self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[new_arch]
+            self.gguf_writer.add_architecture()
+
         # n_group and d_inner are used during reshape_tensors for mamba2
         # NOTE: Explicitly include hparam prefix prefix for d_model to
         # disambiguate with top-level head_dim
@@ -7740,8 +7755,11 @@ class GraniteHybridModel(Mamba2Model, GraniteMoeModel):
             self.gguf_writer.add_rope_dimension_count(rope_dim)
         self.gguf_writer.add_head_count_kv(head_count_kv_vec)
 
-        ## If Bamba, use rope, otherwise don't
-        use_rope = "BambaForCausalLM" in self.hparams["architectures"]
+        ## If Bamba or non-hybrid, use rope, otherwise don't
+        use_rope = (
+            "BambaForCausalLM" in self.hparams["architectures"]
+            or not self._ssm_layers
+        )
         self.gguf_writer.add_rope_scaling_finetuned(use_rope)
         if not use_rope:
             self.gguf_writer.add_context_length(2**20)
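
Reviewer note (not part of the patch): a minimal standalone sketch of the decision the new block makes, assuming the GGUF architecture names "granite", "granitemoe", and "granitehybrid". The helper below is hypothetical and only mirrors the patch's logic outside of the converter classes.

    # Illustrative sketch of the architecture-coercion decision described
    # in the commit message. "num_experts_per_tok" matches the hparam the
    # patch checks; coerce_granite_arch() itself is a made-up helper.
    def coerce_granite_arch(ssm_layers: list[int], hparams: dict) -> str:
        """Pick the effective GGUF architecture for a GraniteHybrid config."""
        if ssm_layers:
            # At least one SSM/mamba layer: keep the hybrid architecture.
            return "granitehybrid"
        # All-attention config: devolve to plain Granite, or GraniteMoe
        # when expert-routing hparams are present.
        has_experts = hparams.get("num_experts_per_tok")
        return "granitemoe" if has_experts else "granite"

    # Example: an all-attention config with MoE routing falls back to granitemoe.
    print(coerce_granite_arch([], {"num_experts_per_tok": 8}))  # granitemoe
    print(coerce_granite_arch([], {}))                          # granite
    print(coerce_granite_arch([1, 3, 5], {}))                   # granitehybrid

Note that the same `not self._ssm_layers` condition also drives the rope decision in the second hunk, so the coerced non-hybrid models keep rope enabled like Granite 3.x.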