mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-31 08:51:55 +00:00 
			
		
		
		
	py : convert-hf-to-gguf-update improvements (#7340)
* convert-hf-to-gguf-update: automate updating * convert-hf-to-gguf-update: improve download * share requests session for performance * create directories only when needed, don't skip downloads when empty directory encountered * be more graceful about errors
This commit is contained in:
		| @@ -20,11 +20,13 @@ | |||||||
| # - Update llama.cpp with the new pre-tokenizer if necessary | # - Update llama.cpp with the new pre-tokenizer if necessary | ||||||
| # | # | ||||||
| # TODO: generate tokenizer tests for llama.cpp | # TODO: generate tokenizer tests for llama.cpp | ||||||
| # TODO: automate the update of convert-hf-to-gguf.py |  | ||||||
| # | # | ||||||
|  |  | ||||||
| import logging | import logging | ||||||
| import os | import os | ||||||
|  | import pathlib | ||||||
|  | import re | ||||||
|  |  | ||||||
| import requests | import requests | ||||||
| import sys | import sys | ||||||
| import json | import json | ||||||
| @@ -35,6 +37,7 @@ from transformers import AutoTokenizer | |||||||
|  |  | ||||||
| logging.basicConfig(level=logging.DEBUG) | logging.basicConfig(level=logging.DEBUG) | ||||||
| logger = logging.getLogger("convert-hf-to-gguf-update") | logger = logging.getLogger("convert-hf-to-gguf-update") | ||||||
|  | sess = requests.Session() | ||||||
|  |  | ||||||
|  |  | ||||||
| class TOKENIZER_TYPE(IntEnum): | class TOKENIZER_TYPE(IntEnum): | ||||||
| @@ -79,63 +82,44 @@ models = [ | |||||||
|     {"name": "jina-v2-de",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", }, |     {"name": "jina-v2-de",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", }, | ||||||
| ] | ] | ||||||
|  |  | ||||||
| # make directory "models/tokenizers" if it doesn't exist |  | ||||||
| if not os.path.exists("models/tokenizers"): |  | ||||||
|     os.makedirs("models/tokenizers") |  | ||||||
|  |  | ||||||
|  |  | ||||||
def download_file_with_auth(url, token, save_path):
    """Download ``url`` to ``save_path``, authenticating with a Bearer token.

    Raises ``requests.HTTPError`` (via ``raise_for_status``) on any non-2xx
    response so that callers can handle all failures uniformly.
    """
    # Only attach the Authorization header when a token is actually set;
    # otherwise we would send the literal header "Bearer None", which can
    # make downloads from public repos fail with 401.
    headers = {"Authorization": f"Bearer {token}"} if token else {}
    response = sess.get(url, headers=headers)
    response.raise_for_status()
    # Create parent directories only after a successful response, so a
    # failed download never leaves an empty directory behind.
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    with open(save_path, 'wb') as f:
        f.write(response.content)
    logger.info(f"File {save_path} downloaded successfully")
def download_model(model):
    """Fetch the tokenizer artifacts for one model repo.

    Files land in ``models/tokenizers/<name>``; files already present on
    disk are left untouched.  SPM tokenizers additionally require
    ``tokenizer.model``.
    """
    name = model["name"]
    repo = model["repo"]
    tokt = model["tokt"]

    os.makedirs(f"models/tokenizers/{name}", exist_ok=True)

    wanted = ["config.json", "tokenizer.json", "tokenizer_config.json"]
    if tokt == TOKENIZER_TYPE.SPM:
        wanted.append("tokenizer.model")

    for fname in wanted:
        dest = f"models/tokenizers/{name}/{fname}"
        if os.path.isfile(dest):
            logger.info(f"{name}: File {dest} already exists - skipping")
            continue
        download_file_with_auth(f"{repo}/resolve/main/{fname}", token, dest)
|  |  | ||||||
# Download every tokenizer, continuing past individual failures so that one
# broken or unreachable repo does not abort the whole update run.
for model in models:
    try:
        download_model(model)
    except Exception as e:
        logger.error(f"Failed to download model {model['name']}. Error: {e}")
|  |  | ||||||
| # generate the source code for the convert-hf-to-gguf.py:get_vocab_base_pre() function: | # generate the source code for the convert-hf-to-gguf.py:get_vocab_base_pre() function: | ||||||
| # TODO: auto-update convert-hf-to-gguf.py with the generated function |  | ||||||
|  |  | ||||||
| src_ifs = "" | src_ifs = "" | ||||||
| for model in models: | for model in models: | ||||||
| @@ -224,11 +208,18 @@ src_func = f""" | |||||||
|         return res |         return res | ||||||
| """ | """ | ||||||
|  |  | ||||||
# Splice the generated get_vocab_base_pre() into convert-hf-to-gguf.py,
# replacing everything between the start/end markers in that file.
convert_py_pth = pathlib.Path("convert-hf-to-gguf.py")
# Read/write with an explicit encoding: the locale default (e.g. cp1252 on
# Windows) would mangle non-ASCII characters in the target file.
convert_py = convert_py_pth.read_text(encoding="utf-8")
convert_py = re.sub(
    r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
    # DOTALL lets (.+?) span the whole multi-line function body between markers.
    lambda m: m.group(1) + src_func + m.group(3),
    convert_py,
    flags=re.DOTALL | re.MULTILINE,
)

convert_py_pth.write_text(convert_py, encoding="utf-8")

logger.info("+++ convert-hf-to-gguf.py was updated")
|  |  | ||||||
| # generate tests for each tokenizer model | # generate tests for each tokenizer model | ||||||
|  |  | ||||||
|   | |||||||
| @@ -402,6 +402,7 @@ class Model: | |||||||
|     # NOTE: this function is generated by convert-hf-to-gguf-update.py |     # NOTE: this function is generated by convert-hf-to-gguf-update.py | ||||||
|     #       do not modify it manually! |     #       do not modify it manually! | ||||||
|     # ref:  https://github.com/ggerganov/llama.cpp/pull/6920 |     # ref:  https://github.com/ggerganov/llama.cpp/pull/6920 | ||||||
|  |     # Marker: Start get_vocab_base_pre | ||||||
|     def get_vocab_base_pre(self, tokenizer) -> str: |     def get_vocab_base_pre(self, tokenizer) -> str: | ||||||
|         # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that |         # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that | ||||||
|         # is specific for the BPE pre-tokenizer used by the model |         # is specific for the BPE pre-tokenizer used by the model | ||||||
| @@ -489,6 +490,7 @@ class Model: | |||||||
|         logger.debug(f"chkhsh: {chkhsh}") |         logger.debug(f"chkhsh: {chkhsh}") | ||||||
|  |  | ||||||
|         return res |         return res | ||||||
|  |         # Marker: End get_vocab_base_pre | ||||||
|  |  | ||||||
|     def _set_vocab_gpt2(self) -> None: |     def _set_vocab_gpt2(self) -> None: | ||||||
|         tokens, toktypes, tokpre = self.get_vocab_base() |         tokens, toktypes, tokpre = self.get_vocab_base() | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Aarni Koskela
					Aarni Koskela