Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-10-30 08:42:00 +00:00)

	llama : support RWKV v6 models (#8980)
* convert_hf_to_gguf: Add support for RWKV v6
* Add RWKV tokenization
* Fix build
* Do not use special tokens when matching in RWKV tokenizer
* Fix model loading
* Add (broken) placeholder graph builder for RWKV
* Add workaround for kv cache
* Add logits conversion to rwkv5
* Add rwkv5 layer norms
* Add time mix KVRG & correct merge mistake
* Add remaining time mix parameters
* Add time mix output loading
* Add placeholder llm_build_time_mix
* Fix build
* Load more tensors for rwkv v6
* Fix rwkv tokenizer
* ggml: Add unary operator Exp
* RWKV v6 graph building
* Add ``rescale_every_n_layers`` parameter
* Add ``wkv.head_size`` key for RWKV so it doesn't reuse Mamba ssm parameters
* Fix offloading layers to CUDA
* Fix parallel inferencing for RWKV
* Remove trailing whitespaces
* build_rwkv: Avoid using inplace operations
* convert_hf_to_gguf: rwkv: Avoid using ``eval``
* convert_hf_to_gguf: rwkv tokenizer: Don't escape sequences manually
* Update convert_hf_to_gguf.py
* ggml: Add backward computation for unary op ``exp``
* Update convert_hf_to_gguf.py
* Update convert_hf_to_gguf.py
* Use MODEL_ARCH.RWKV6 instead of MODEL_ARCH.RWKV
* build_rwkv6: Simplify graph
* llama: rwkv6: Detect model.type
* llama: rwkv6: Fix tensor loading for 7B/14B models
* llama: rwkv6: Fix group_norm assertion failure with Metal
* llama: rwkv6: Clean up
* llama: rwkv6: Add quantization tensor exclusion
* llama: rwkv6: Use the new advanced batch splits
* Update src/llama.cpp
* llama: rwkv6: Use ``ggml_norm`` instead of ``ggml_group_norm``
* llama: rwkv6: Apply code style and misc changes
* converter: Use class name ``Rwkv6Model``
* llama: rwkv6: Make use of key ``feed_forward_length``
* llama: rwkv6: Add kv ``time_mix_extra_dim`` and ``time_decay_extra_dim``
* converter: Match ``new_name`` instead of ``name`` for float32 explicit tensors
* llama: rwkv6: Keep ``time_mix_w1/w2`` as F32
* llama: rwkv6: Remove unused nodes
* llama: rwkv6: Apply code format changes
* llama: rwkv6: Add lora for some supported tensors (currently att.key/receptance/value/gate/output, ffn.receptance/key/value, as well as head.weight)
* rwkv : speed-up tokenization using trie
* minor : style + indentation
* llama: rwkv6: Avoid division by zero
* ggml: rwkv_wkv: Avoid copying the state

---------

Signed-off-by: Molly Sophia <mollysophia379@gmail.com>
Co-authored-by: Layl Bongers <3094382+LaylBongers@users.noreply.github.com>
Co-authored-by: compilade <git@compilade.net>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
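For context on the ``rwkv : speed-up tokenization using trie`` item: the RWKV world tokenizer matches raw bytes greedily, always taking the longest vocabulary entry at the current position, so a byte trie lets the matcher walk the input once instead of rescanning the whole vocabulary. The sketch below shows the idea in Python; the ``TrieNode``/``build_trie``/``tokenize`` names and the toy vocab are illustrative only, not the actual C++ implementation in src/llama.cpp.

from __future__ import annotations

class TrieNode:
    def __init__(self):
        self.children: dict[int, TrieNode] = {}
        self.token_id: int | None = None  # set when a vocab entry ends here

def build_trie(vocab: dict[bytes, int]) -> TrieNode:
    root = TrieNode()
    for token, token_id in vocab.items():
        node = root
        for byte in token:  # iterating bytes yields ints 0..255
            node = node.children.setdefault(byte, TrieNode())
        node.token_id = token_id
    return root

def tokenize(data: bytes, root: TrieNode) -> list[int]:
    ids: list[int] = []
    pos = 0
    while pos < len(data):
        node, match_id, match_len = root, None, 0
        # walk as deep as the input allows, remembering the longest
        # complete token passed along the way
        for i in range(pos, len(data)):
            node = node.children.get(data[i])
            if node is None:
                break
            if node.token_id is not None:
                match_id, match_len = node.token_id, i - pos + 1
        assert match_id is not None, "input byte not covered by the vocab"
        ids.append(match_id)
        pos += match_len
    return ids

# Toy usage: with single-byte tokens plus one longer entry, the greedy
# longest match prefers the longer token.
vocab = {b"a": 0, b"b": 1, b"ab": 2}
print(tokenize(b"abab", build_trie(vocab)))  # -> [2, 2]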
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -3,6 +3,7 @@
 
 from __future__ import annotations
 
+import ast
 import logging
 import argparse
 import contextlib
@@ -298,9 +299,12 @@ class Model:
                             gguf.MODEL_TENSOR.POS_EMBD,
                             gguf.MODEL_TENSOR.TOKEN_TYPES,
                             gguf.MODEL_TENSOR.SSM_CONV1D,
+                            gguf.MODEL_TENSOR.TIME_MIX_FIRST,
+                            gguf.MODEL_TENSOR.TIME_MIX_W1,
+                            gguf.MODEL_TENSOR.TIME_MIX_W2,
                         )
                     )
-                    or not name.endswith(".weight")
+                    or not new_name.endswith(".weight")
                 ):
                     data_qtype = gguf.GGMLQuantizationType.F32
 
@@ -2716,6 +2720,84 @@ class StarCoder2Model(Model):
     model_arch = gguf.MODEL_ARCH.STARCODER2
 
 
+@Model.register("Rwkv6ForCausalLM")
+class Rwkv6Model(Model):
+    model_arch = gguf.MODEL_ARCH.RWKV6
+
+    def set_vocab(self):
+        assert (self.dir_model / "rwkv_vocab_v20230424.txt").is_file()
+        vocab_size = self.hparams.get("vocab_size", 65536)
+
+        tokens: list[bytes] = ['<s>'.encode("utf-8")]
+        toktypes: list[int] = [gguf.TokenType.CONTROL]
+
+        with open(self.dir_model / "rwkv_vocab_v20230424.txt", "r", encoding="utf-8") as f:
+            lines = f.readlines()
+            for line in lines:
+                parts = line.split(' ')
+                assert len(parts) >= 3
+                token, token_len = ast.literal_eval(' '.join(parts[1:-1])), int(parts[-1])
+                token = token.encode("utf-8") if isinstance(token, str) else token
+                assert isinstance(token, bytes)
+                assert len(token) == token_len
+                token_text: str = repr(token)[2:-1]  # "b'\xff'" -> "\xff"
+                tokens.append(token_text.encode("utf-8"))
+                toktypes.append(gguf.TokenType.NORMAL)
+        remainder = vocab_size - len(tokens)
+        assert remainder >= 0
+        for i in range(len(tokens), vocab_size):
+            tokens.append(f"[PAD{i}]".encode("utf-8"))
+            toktypes.append(gguf.TokenType.UNUSED)
+
+        self.gguf_writer.add_tokenizer_model("rwkv")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+    def set_gguf_parameters(self):
+        block_count = self.hparams["num_hidden_layers"]
+        head_size = self.hparams["head_size"]
+        hidden_size = self.hparams["hidden_size"]
+        layer_norm_eps = self.hparams["layer_norm_epsilon"]
+        rescale_every_n_layers = self.hparams["rescale_every"]
+        intermediate_size = self.hparams["intermediate_size"] if self.hparams["intermediate_size"] is not None else int((hidden_size * 3.5) // 32 * 32)
+        time_mix_extra_dim = 64 if hidden_size == 4096 else 32
+        time_decay_extra_dim = 128 if hidden_size == 4096 else 64
+
+        # RWKV isn't context limited
+        self.gguf_writer.add_context_length(1048576)
+        self.gguf_writer.add_embedding_length(hidden_size)
+        self.gguf_writer.add_block_count(block_count)
+        self.gguf_writer.add_layer_norm_eps(layer_norm_eps)
+        self.gguf_writer.add_rescale_every_n_layers(rescale_every_n_layers)
+        self.gguf_writer.add_wkv_head_size(head_size)
+        self.gguf_writer.add_time_mix_extra_dim(time_mix_extra_dim)
+        self.gguf_writer.add_time_decay_extra_dim(time_decay_extra_dim)
+        self.gguf_writer.add_feed_forward_length(intermediate_size)
+        self.gguf_writer.add_file_type(self.ftype)
+
+        # required by llama.cpp, unused
+        self.gguf_writer.add_head_count(0)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        new_name = self.map_tensor_name(name)
+
+        if not (new_name.endswith(".weight") or new_name.endswith(".bias")):
+            new_name += ".weight"
+
+        if new_name.endswith("time_mix_w1.weight") or new_name.endswith("time_mix_decay_w1.weight") or new_name.endswith("time_mix_decay_w2.weight"):
+            data_torch = data_torch.transpose(0, 1)
+
+        if new_name.endswith("time_mix_w2.weight"):
+            data_torch = data_torch.permute(0, 2, 1)
+
+        rescale_every_n_layers = self.hparams["rescale_every"]
+        if rescale_every_n_layers > 0:
+            if new_name.endswith("time_mix_output.weight") or new_name.endswith("channel_mix_value.weight"):
+                data_torch = data_torch.div_(2 ** int(bid // rescale_every_n_layers))
+
+        yield (new_name, data_torch)
+
+
 @Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM")
 class MambaModel(Model):
     model_arch = gguf.MODEL_ARCH.MAMBA
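A note on ``set_vocab`` above: each line of ``rwkv_vocab_v20230424.txt`` holds a token index, the token itself as a Python string or bytes literal, and the token's byte length. The literal may contain spaces, which is why the converter rejoins everything between the first and last fields before handing it to ``ast.literal_eval``. A small standalone illustration of that parsing step (the sample lines are hypothetical but follow this format):

import ast

# Hypothetical lines in the "<index> <token literal> <byte length>" format
sample_lines = [
    "257 ' the' 4",
    "300 b'\\xe4\\xb8\\xad' 3",  # UTF-8 bytes of a CJK character
]

for line in sample_lines:
    parts = line.split(' ')
    # the token literal may itself contain spaces, so rejoin the middle fields
    token = ast.literal_eval(' '.join(parts[1:-1]))
    token_len = int(parts[-1])
    token = token.encode("utf-8") if isinstance(token, str) else token
    assert len(token) == token_len  # same consistency check as the converter
    print(parts[0], token)  # 257 b' the' / 300 b'\xe4\xb8\xad'

Separately, ``modify_tensors`` pre-divides ``time_mix_output.weight`` and ``channel_mix_value.weight`` by 2^(bid // rescale_every), baking RWKV's periodic layer rescaling into the converted weights so the runtime graph can apply the matching rescale every n layers.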