diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index f601f3277a..db837a2cbb 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -6673,17 +6673,16 @@ class FalconH1Model(Mamba2Model):
 
         # Add Falcon Mamba2 specific configuration
-        self.gguf_writer.add_uint32("falcon_h1.attention.head_dim", self.hparams["head_dim"])
-        self.gguf_writer.add_uint32("falcon_h1.ssm.mamba_d_inner", self.hparams["mamba_d_ssm"])
+        self.gguf_writer.add_ssm_head_dim(self.hparams["mamba_d_head"])
         self.gguf_writer.add_ssm_inner_size(self.hparams["mamba_d_ssm"])
 
-        self.gguf_writer.add_uint32("falcon_h1.num_attention_heads", self.find_hparam(["num_attention_heads"]))
-        self.gguf_writer.add_uint32("falcon_h1.num_key_value_heads",
-                                    self.find_hparam(["num_key_value_heads"], optional=True) or
-                                    self.find_hparam(["num_attention_heads"]))
+        self.gguf_writer.add_head_count(self.find_hparam(["num_attention_heads"]))
+        self.gguf_writer.add_key_length(self.hparams["head_dim"])
+        self.gguf_writer.add_value_length(self.hparams["head_dim"])
+        self.gguf_writer.add_head_count_kv(self.find_hparam(["num_key_value_heads"], optional=True) or
+                                           self.find_hparam(["num_attention_heads"]))
 
         # Add any other Falcon Mamba2 specific configuration
-        self.gguf_writer.add_bool("falcon_h1.mamba_rms_norm", self.find_hparam(["mamba_rms_norm"], optional=True))
         self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
 
 
 ###### CONVERSION LOGIC ######
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index b339872581..b6bd815090 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -128,8 +128,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EMBEDDING_SCALE,                   "%s.embedding_scale"                   },
     { LLM_KV_TOKEN_SHIFT_COUNT,                 "%s.token_shift_count"                 },
     { LLM_KV_INTERLEAVE_MOE_LAYER_STEP,         "%s.interleave_moe_layer_step"         },
-    { LLM_KV_ATTN_HEAD_DIM,                     "%s.attention.head_dim"                },
-
+
     { LLM_KV_ATTENTION_HEAD_COUNT,              "%s.attention.head_count"              },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV,           "%s.attention.head_count_kv"           },
     { LLM_KV_ATTENTION_MAX_ALIBI_BIAS,          "%s.attention.max_alibi_bias"          },
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 3b03308b8f..f6ad0fb00a 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -158,7 +158,6 @@ enum llm_kv {
     LLM_KV_ATTENTION_LAYER_INDICES,
 
     // Falcon-H1 specific
-    LLM_KV_ATTN_HEAD_DIM,
     LLM_KV_SSM_HEAD_DIM,
     LLM_KV_N_LAYER,
     LLM_KV_FALCON_H1_MAMBA_RMS_NORM,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 140ec5b8e7..18c96a49c0 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -4550,7 +4550,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, ssm_num_heads}, 0);
                     layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, ssm_num_heads}, 0);
                     // ssm_norm
-                    layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {ssm_intermediate_size / ssm_n_groups, ssm_n_groups}, 0);
+                    layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {ssm_intermediate_size / ssm_n_groups, ssm_n_groups}, 1);
                     // out_proj
                     layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {ssm_intermediate_size, hidden_size}, 0);