llama : add API for token type

ggml-ci
commit 0b53b8b08d
parent 8d177eddeb
Author: Georgi Gerganov
Date:   2023-08-21 19:35:31 +03:00

6 changed files with 115 additions and 116 deletions

convert.py

@@ -241,17 +241,19 @@ class BpeVocab:
             added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
         else:
             added_tokens = {}
+
         vocab_size: int = len(self.bpe_tokenizer)
-        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
-        actual_ids = sorted(added_tokens.values())
+        expected_ids    = list(range(vocab_size, vocab_size + len(added_tokens)))
+        actual_ids      = sorted(added_tokens.values())
         if expected_ids != actual_ids:
             raise Exception(f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}")
+
         items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
-        self.added_tokens_list = [text for (text, idx) in items]
+        self.added_tokens_list    = [text for (text, idx) in items]
         self.vocab_size_base: int = vocab_size
-        self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
-        self.fname_tokenizer = fname_tokenizer
-        self.fname_added_tokens = fname_added_tokens
+        self.vocab_size: int      = self.vocab_size_base + len(self.added_tokens_list)
+        self.fname_tokenizer      = fname_tokenizer
+        self.fname_added_tokens   = fname_added_tokens

     def bpe_tokens(self) -> Iterable[Tuple[bytes, float]]:
         tokenizer = self.bpe_tokenizer
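The hunk above only realigns the added-token bookkeeping in BpeVocab.__init__, but the check it touches is worth spelling out: added tokens must occupy the ID range immediately after the base BPE vocabulary. A standalone illustration with made-up token names and a hypothetical base vocabulary size:

# Illustration of the contiguity check above (token names and sizes are made up).
bpe_vocab_size = 32000                                  # stands in for len(self.bpe_tokenizer)
added_tokens   = {"<|user|>": 32000, "<|bot|>": 32001}  # stands in for added_tokens.json

expected_ids = list(range(bpe_vocab_size, bpe_vocab_size + len(added_tokens)))
actual_ids   = sorted(added_tokens.values())
assert expected_ids == actual_ids == [32000, 32001]     # added IDs directly follow the base vocab

# sort by ID so the added texts are appended to the vocabulary in order
added_tokens_list = [text for (text, _idx) in sorted(added_tokens.items(), key=lambda kv: kv[1])]
total_vocab_size  = bpe_vocab_size + len(added_tokens_list)  # 32002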
@@ -261,12 +263,12 @@ class BpeVocab:
         for i, item in enumerate(tokenizer):
             text: bytes = item.encode("utf-8")
             score: float = -i
-            yield text, score, 4
+            yield text, score, gguf.TokenType.USER_DEFINED

     def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
         for text in self.added_tokens_list:
             score = -1000.0
-            yield text.encode("utf-8"), score, 4
+            yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED

     def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
         yield from self.bpe_tokens()
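Both generators now yield (text, score, toktype) triples instead of (text, score) pairs, and every BPE base token and added token is tagged gguf.TokenType.USER_DEFINED. A consumer of all_tokens() would typically split the triples into the parallel lists (texts, scores, types) that the GGUF vocabulary metadata stores; a minimal sketch, where vocab stands for a BpeVocab instance and the variable names are illustrative:

tokens   = []  # token texts as bytes
scores   = []  # per-token scores
toktypes = []  # per-token types (gguf.TokenType values)

for text, score, toktype in vocab.all_tokens():
    tokens.append(text)
    scores.append(score)
    toktypes.append(int(toktype))  # enum members serialize as their integer value

# tokens / scores / toktypes are then written into the GGUF file as parallel arrays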
@@ -304,27 +306,27 @@ class SentencePieceVocab:
             text: bytes = piece.encode("utf-8")
             score: float = tokenizer.get_score(i)

-            toktype = 1 # defualt to normal token type
+            toktype = gguf.TokenType.NORMAL
             if tokenizer.is_unknown(i):
-                toktype = 2
+                toktype = gguf.TokenType.UNKNOWN
             if tokenizer.is_control(i):
-                toktype = 3
+                toktype = gguf.TokenType.CONTROL

             # NOTE: I think added_tokens are user defined.
             # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
-            # if tokenizer.is_user_defined(i): toktype = 4
+            # if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED

             if tokenizer.is_unused(i):
-                toktype = 5
+                toktype = gguf.TokenType.UNUSED
             if tokenizer.is_byte(i):
-                toktype = 6
+                toktype = gguf.TokenType.BYTE

             yield text, score, toktype

     def added_tokens(self) -> Iterable[Tuple[bytes, float]]:
         for text in self.added_tokens_list:
             score = -1000.0
-            yield text.encode("utf-8"), score, 4
+            yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED

     def all_tokens(self) -> Iterable[Tuple[bytes, float]]:
         yield from self.sentencepiece_tokens()
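Taken together, the substitutions in this hunk spell out the full literal-to-name mapping for token types. A sketch of what the gguf.TokenType enum presumably looks like; the name/value pairs are read off this diff, while the IntEnum definition itself is an assumption about the gguf package:

from enum import IntEnum

class TokenType(IntEnum):
    # values taken from the numeric literals replaced in this commit
    NORMAL       = 1
    UNKNOWN      = 2
    CONTROL      = 3
    USER_DEFINED = 4
    UNUSED       = 5
    BYTE         = 6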
@@ -725,6 +727,7 @@ class OutputFile:
         self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])

     def add_meta_arch(self, params: Params) -> None:
+        self.gguf.add_name ("llama")
         self.gguf.add_context_length (params.n_ctx)
         self.gguf.add_embedding_length (params.n_embd)
         self.gguf.add_block_count (params.n_layer)
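For orientation, each of these add_meta_arch calls writes one key-value pair into the GGUF header. The key strings below are an assumption based on the GGUF naming convention and are not part of this diff:

# Assumed mapping from writer calls to GGUF metadata keys (illustrative only).
ARCH_METADATA_KEYS = {
    "add_name":             "general.name",            # "llama"
    "add_context_length":   "llama.context_length",    # params.n_ctx
    "add_embedding_length": "llama.embedding_length",  # params.n_embd
    "add_block_count":      "llama.block_count",       # params.n_layer
}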