* Merged the changes from the deepseek models to the main branch
* Moved regex patterns to unicode.cpp and updated unicode.h
* Moved header files
* Resolved issues
* added and refactored unicode_regex_split and related functions
* Updated/merged the deepseek coder PR
* Refactored code
* Adding unicode regex mappings
* Adding unicode regex function
* Added needed functionality, testing remains
* Fixed issues
* Fixed issue with gpt2 regex custom preprocessor
* unicode : fix? unicode_wstring_to_utf8
* lint : fix whitespaces
* tests : add tokenizer tests for numbers
* unicode : remove redundant headers
* tests : remove and rename tokenizer test scripts
* tests : add sample usage
* gguf-py : reader prints warnings on duplicate keys
* llama : towards llama3 tokenization support (wip)
* unicode : shot in the dark to fix tests on Windows
* unicode : first try custom implementations
* convert : add "tokenizer.ggml.pre" GGUF KV (wip)
* llama : use new pre-tokenizer type
* convert : fix pre-tokenizer type writing
* lint : fix
* make : add test-tokenizer-0-llama-v3
* wip
* models : add llama v3 vocab file
* llama : adapt punctuation regex + add llama 3 regex
* minor
* unicode : set bomb
* unicode : set bomb
* unicode : always use std::wregex
* unicode : support \p{N}, \p{L} and \p{P} natively (see the regex sketch below)
* unicode : try fix windows
* unicode : category support via std::regex
* unicode : clean-up
* unicode : simplify
* convert : add convert-hf-to-gguf-update.py (see the fingerprint sketch below)
ggml-ci
* lint : update
* convert : add falcon
ggml-ci
* unicode : normalize signatures
* lint : fix
* lint : fix
* convert : remove unused functions
* convert : add comments
* convert : exercise contractions
ggml-ci
* lint : fix
* cmake : refactor test targets
* tests : refactor vocab tests
ggml-ci
* tests : add more vocabs and tests
ggml-ci
* unicode : cleanup
* scripts : ignore new update script in check-requirements.sh
* models : add phi-3, mpt, gpt-2, starcoder
* tests : disable obsolete
ggml-ci
* tests : use faster bpe test
ggml-ci
* llama : more prominent warning for old BPE models
* tests : disable test-tokenizer-1-bpe due to slowness
ggml-ci
---------
Co-authored-by: Jaggzh <jaggz.h@gmail.com>
Co-authored-by: Kazim Abrar Mahi <kazimabrarmahi135@gmail.com>
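
Several of the commits above add native handling of the Unicode categories \p{L}, \p{N} and \p{P} when splitting text before BPE merging. For intuition, here is the well-known GPT-2 pre-tokenizer pattern exercised with Python's third-party regex module (a sketch for illustration only, not llama.cpp's C++ implementation; the built-in re module does not support \p{...} classes):

import regex

# GPT-2-style pre-tokenization split: contractions, letter runs, digit runs,
# punctuation runs, and whitespace handling
GPT2_PRE = regex.compile(
    r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"
)

print(GPT2_PRE.findall("Hello, world! 333"))
# ['Hello', ',', ' world', '!', ' 333']

The new convert-hf-to-gguf-update.py and the "tokenizer.ggml.pre" GGUF KV hinge on identifying which pre-tokenizer a given model uses. A minimal sketch of that fingerprinting idea, where the probe string and helper names are illustrative assumptions rather than the script's actual contents:

import hashlib

from transformers import AutoTokenizer

# assumed probe text mixing whitespace runs, digits, punctuation and non-Latin
# scripts, so that different pre-tokenizer regexes yield different token ids
PROBE_TEXT = " \t\n Hello world 333333 нещо на Български 🦙.cpp"

def pre_tokenizer_fingerprint(model_dir: str) -> str:
    # a hash of the token ids identifies the pre-tokenization behavior
    ids = AutoTokenizer.from_pretrained(model_dir).encode(PROBE_TEXT)
    return hashlib.sha256(str(ids).encode("utf-8")).hexdigest()

# hypothetical hash -> name table; the matched name is what would be written
# to the "tokenizer.ggml.pre" KV and selected by the loader at run time
PRE_TYPES = {}  # e.g. {"<hash of llama3 ids>": "llama3", "<hash of falcon ids>": "falcon"}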
tests/test-tokenizer-0-bpe.py (118 lines, 3.4 KiB, Python)
# tests with BPE tokenizer
#
# sample usage:
#
#   python3 tests/test-tokenizer-0-bpe.py ~/Data/huggingface/Meta-Llama-3-8B-Instruct/
#   python3 tests/test-tokenizer-0-bpe.py ~/Data/huggingface/falcon-7b/
#   python3 tests/test-tokenizer-0-bpe.py ~/Data/huggingface/deepseek-coder-6.7b-instruct/
#

import argparse

from transformers import AutoTokenizer

parser = argparse.ArgumentParser()
parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
parser.add_argument("--fname-tok",   help="path to a text file to tokenize")
args = parser.parse_args()

dir_tokenizer = args.dir_tokenizer

tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer)

# edge cases: empty strings, whitespace runs, mixed scripts, emoji, and digit
# runs of increasing length (BPE digit grouping differs between models)
tests = [
    "",
    " ",
    "  ",
    "   ",
    "\t",
    "\n",
    "\n\n",
    "\n\n\n",
    "\t\n",
    "Hello world",
    " Hello world",
    "Hello World",
    " Hello World",
    " Hello World!",
    "Hello, world!",
    " Hello, world!",
    " this is 🦙.cpp",
    "w048 7tuijk dsdfhu",
    "нещо на Български",
    "កាន់តែពិសេសអាចខលចេញ",
    "🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
    "Hello",
    " Hello",
    "  Hello",
    "   Hello",
    "    Hello",
    "    Hello\n    Hello",
    " (",
    "\n =",
    "' era",
    "Hello, y'all! How are you 😁 ?我想在apple工作1314151天~",
    "3",
    "33",
    "333",
    "3333",
    "33333",
    "333333",
    "3333333",
    "33333333",
    "333333333",
]

for text in tests:
    print('text: ', text)
    print(tokenizer.encode(text))
    print(tokenizer.decode(tokenizer.encode(text)))

# print the test strings as C++ initializer lists that can be pasted into the
# C++ tokenizer tests
print("\n\ntests for C++:\n")
for text in tests:
    res = tokenizer.encode(text)

    # escape newlines/tabs and quote the text so it is a valid C++ string literal
    k = text.replace('\n', '\\n')
    k = k.replace('\t', '\\t')
    k = '"' + k + '"'
    print("{ %-24s, { " % k, end='')
    for x in res:
        print("%7d," % x, end='')
    print(" }, },")

print(tokenizer.encode('hello'))
print(tokenizer.encode('world'))
print(tokenizer.encode(' world'))
print(tokenizer.encode('hello world'))

fname_tok = args.fname_tok
if fname_tok:
    print('tokenizing file: ', fname_tok)
    fname_out = fname_tok + '.tok'
    with open(fname_tok, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        s = ''.join(lines)
        res = tokenizer.encode(s)
        # write to file (use fout so the outer file handle f is not shadowed)
        with open(fname_out, 'w', encoding='utf-8') as fout:
            for x in res:
                # LLaMA v3 for some reason strips the space for these tokens (and others)
                # if x == 662:
                #     fout.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
                # elif x == 1174:
                #     fout.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
                # elif x == 2564:
                #     fout.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
                # elif x == 758:
                #     fout.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
                # elif x == 949:
                #     fout.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
                # elif x == 5354:
                #     fout.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n')
                # else:
                #     fout.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
                fout.write(str(x) + ' \'' + tokenizer.decode(x).strip() + '\'\n')
        print('len(res): ', len(res))
        print('len(lines): ', len(lines))
    print('results written to: ', fname_out)
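
A natural follow-up (not part of this script) is to diff the file.tok reference produced above against the tokenization emitted by the C++ side. A minimal comparison helper, assuming both files use the "<id> '<piece>'" line format written by this script; the file names in the usage comment are hypothetical:

import sys

def load_tok(path):
    # one "<id> '<piece>'" entry per line, as written by test-tokenizer-0-bpe.py
    with open(path, encoding='utf-8') as f:
        return [line.rstrip('\n') for line in f]

def diff_tok(ref_path, out_path):
    ref = load_tok(ref_path)
    out = load_tok(out_path)
    mismatches = 0
    for i, (a, b) in enumerate(zip(ref, out)):
        if a != b:
            print('line %d: ref=%r got=%r' % (i, a, b))
            mismatches += 1
    if len(ref) != len(out):
        print('length mismatch: ref=%d got=%d' % (len(ref), len(out)))
        mismatches += 1
    return mismatches

if __name__ == '__main__':
    # e.g. python3 diff-tok.py input.txt.tok input.txt.tok.cpp
    sys.exit(1 if diff_tok(sys.argv[1], sys.argv[2]) else 0)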