mirror of https://github.com/ggml-org/llama.cpp.git
model : add grok-2 support (#15539)
* add grok-2 support
* type fix
* type fix
* type fix
* "fix" vocab for invalid sequences
* fix expert tensor mapping and spaces in vocab
* add chat template
* fix norm tensor mapping
* rename layer_out_norm to ffn_post_norm
* ensure ffn_post_norm is mapped
* fix experts merging
* remove erroneous FFN_GATE entry
* concatenate split tensors and add more metadata
* process all expert layers and try cat instead of hstack
* add support for community BPE vocab
* fix expert feed forward length and ffn_down concat
* commit this too
* add ffn_up/gate/down, unsure if sequence is right
* add ffn_gate/down/up to tensor names
* correct residual moe (still not working)
* mess--
* fix embedding scale being applied twice
* add built in chat template
* change beta fast for grok if default value
* remove spm vocab in favor of community bpe vocab
* change attention temp length metadata type to integer
* update attention temp length metadata
* remove comment
* replace M_SQRT2 with std::sqrt(2)
* add yarn metadata, move defaults to hparams
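As a usage sketch (not part of the commit): the new metadata is written from a conversion script like any other GGUF key-value pair. The file name, architecture string, and all values below are hypothetical placeholders, not taken from the actual grok-2 conversion.

from gguf import GGUFWriter

# All values are illustrative; a real conversion derives them from
# the model's config.
writer = GGUFWriter("grok-2.gguf", arch="grok")
writer.add_router_logit_softcapping(30.0)      # new in this commit
writer.add_attn_output_scale(0.5)              # new in this commit
writer.add_attn_temperature_length(1024)       # new in this commit, stored as uint32
writer.add_rope_scaling_yarn_ext_factor(1.0)   # new yarn metadata
writer.add_rope_scaling_yarn_attn_factor(1.0)
writer.add_rope_scaling_yarn_beta_fast(8.0)
writer.add_rope_scaling_yarn_beta_slow(1.0)
writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.close()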
@@ -733,6 +733,9 @@ class GGUFWriter:
     def add_attn_logit_softcapping(self, value: float) -> None:
         self.add_float32(Keys.LLM.ATTN_LOGIT_SOFTCAPPING.format(arch=self.arch), value)
 
+    def add_router_logit_softcapping(self, value: float) -> None:
+        self.add_float32(Keys.LLM.ROUTER_LOGIT_SOFTCAPPING.format(arch=self.arch), value)
+
     def add_final_logit_softcapping(self, value: float) -> None:
         self.add_float32(Keys.LLM.FINAL_LOGIT_SOFTCAPPING.format(arch=self.arch), value)
 
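For intuition (not from the diff): llama.cpp applies logit softcapping as a smooth tanh squash rather than a hard clip. Assuming the new router key is consumed the same way as the existing attn/final softcapping keys, the effect is:

import numpy as np

def softcap(x: np.ndarray, cap: float) -> np.ndarray:
    # Near-identity for |x| much smaller than cap, saturates smoothly at +/-cap.
    return cap * np.tanh(x / cap)

# Large router logits are pulled inside +/-30 instead of being clipped.
print(softcap(np.array([-50.0, -5.0, 0.0, 5.0, 50.0]), 30.0))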
@@ -829,6 +832,12 @@ class GGUFWriter:
     def add_attention_scale(self, value: float) -> None:
         self.add_float32(Keys.Attention.SCALE.format(arch=self.arch), value)
 
+    def add_attn_output_scale(self, value: float) -> None:
+        self.add_float32(Keys.Attention.OUTPUT_SCALE.format(arch=self.arch), value)
+
+    def add_attn_temperature_length(self, value: int) -> None:
+        self.add_uint32(Keys.Attention.TEMPERATURE_LENGTH.format(arch=self.arch), value)
+
     def add_pooling_type(self, value: PoolingType) -> None:
         self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value)
 
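Once written, these keys land in the GGUF header like any other metadata and can be checked from the reader side of the gguf package. A sketch; the key strings are assumed to follow the usual "{arch}.attention.*" pattern and the file name is hypothetical:

from gguf import GGUFReader

reader = GGUFReader("grok-2.gguf")  # hypothetical file
for key in ("grok.attention.output_scale", "grok.attention.temperature_length"):
    field = reader.get_field(key)
    if field is not None:
        # For scalar fields, data[0] indexes the part holding the value.
        print(key, field.parts[field.data[0]])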
@@ -859,6 +868,18 @@ class GGUFWriter:
     def add_rope_scaling_yarn_log_mul(self, value: float) -> None:
         self.add_float32(Keys.Rope.SCALING_YARN_LOG_MUL.format(arch=self.arch), value)
 
+    def add_rope_scaling_yarn_ext_factor(self, value: float) -> None:
+        self.add_float32(Keys.Rope.SCALING_YARN_EXT_FACTOR.format(arch=self.arch), value)
+
+    def add_rope_scaling_yarn_attn_factor(self, value: float) -> None:
+        self.add_float32(Keys.Rope.SCALING_YARN_ATTN_FACTOR.format(arch=self.arch), value)
+
+    def add_rope_scaling_yarn_beta_fast(self, value: float) -> None:
+        self.add_float32(Keys.Rope.SCALING_YARN_BETA_FAST.format(arch=self.arch), value)
+
+    def add_rope_scaling_yarn_beta_slow(self, value: float) -> None:
+        self.add_float32(Keys.Rope.SCALING_YARN_BETA_SLOW.format(arch=self.arch), value)
+
     def add_ssm_conv_kernel(self, value: int) -> None:
         self.add_uint32(Keys.SSM.CONV_KERNEL.format(arch=self.arch), value)
 
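The commit message notes that beta_fast is changed for grok when left at its default; beta_fast and beta_slow bound the YaRN correction ramp across the RoPE dimensions. A Python rendering of the correction-dimension formula llama.cpp uses (all numbers below are illustrative, not grok-2's actual hparams):

import math

def yarn_corr_dim(n_dims: int, n_ctx_orig: int, n_rot: float, base: float) -> float:
    # RoPE dimension whose wavelength corresponds to n_rot rotations
    # over the original training context.
    return n_dims * math.log(n_ctx_orig / (n_rot * 2 * math.pi)) / (2 * math.log(base))

low  = yarn_corr_dim(128, 8192, 32.0, 10000.0)  # ramp start (beta_fast)
high = yarn_corr_dim(128, 8192, 1.0, 10000.0)   # ramp end (beta_slow)
print(low, high)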