Mirror of https://github.com/ggml-org/llama.cpp.git, synced 2025-10-30 08:42:00 +00:00.
		
		
		
	*.py: Stylistic adjustments for python (#8233)
* Superfluous parens in conditionals were removed. * Unused args in functions were removed. * Replaced unused `idx` var with `_` * Initializing file_format and format_version attributes * Renaming constant to capitals * Preventing redefinition of the `f` var Signed-off-by: Jiri Podivin <jpodivin@redhat.com>
This commit is contained in:
		| @@ -737,7 +737,7 @@ class Model: | ||||
|                 added_tokens_json = json.load(f) | ||||
|                 for key in added_tokens_json: | ||||
|                     token_id = added_tokens_json[key] | ||||
|                     if (token_id >= vocab_size): | ||||
|                     if token_id >= vocab_size: | ||||
|                         logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') | ||||
|                         continue | ||||
|  | ||||
| @@ -2005,7 +2005,7 @@ class Phi3MiniModel(Model): | ||||
|  | ||||
|                 for key in added_tokens_json: | ||||
|                     token_id = added_tokens_json[key] | ||||
|                     if (token_id >= vocab_size): | ||||
|                     if token_id >= vocab_size: | ||||
|                         logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') | ||||
|                         continue | ||||
|  | ||||
| @@ -2081,7 +2081,7 @@ class Phi3MiniModel(Model): | ||||
|  | ||||
|         # write rope scaling for long context (128k) model | ||||
|         rope_scaling = self.find_hparam(['rope_scaling'], True) | ||||
|         if (rope_scaling is None): | ||||
|         if rope_scaling is None: | ||||
|             return | ||||
|  | ||||
|         scale = max_pos_embds / orig_max_pos_embds | ||||
| @@ -2728,7 +2728,7 @@ class JinaBertV2Model(BertModel): | ||||
|  | ||||
|             yield name, data | ||||
|  | ||||
|     def set_vocab(self, *args, **kwargs): | ||||
|     def set_vocab(self): | ||||
|         tokenizer_class = 'BertTokenizer' | ||||
|         with open(self.dir_model / "tokenizer_config.json", "r", encoding="utf-8") as f: | ||||
|             tokenizer_class = json.load(f)['tokenizer_class'] | ||||
| @@ -2876,7 +2876,7 @@ class ArcticModel(Model): | ||||
|                     added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"] | ||||
|                     for token_id, token_json in added_tokens_decoder.items(): | ||||
|                         token_id = int(token_id) | ||||
|                         if (token_id >= vocab_size): | ||||
|                         if token_id >= vocab_size: | ||||
|                             logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') | ||||
|                             continue | ||||
|  | ||||
| @@ -3125,7 +3125,7 @@ class T5Model(Model): | ||||
|                 added_tokens_json = json.load(f) | ||||
|                 for key in added_tokens_json: | ||||
|                     token_id = added_tokens_json[key] | ||||
|                     if (token_id >= vocab_size): | ||||
|                     if token_id >= vocab_size: | ||||
|                         logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') | ||||
|                         continue | ||||
|  | ||||
|   | ||||
| @@ -50,7 +50,7 @@ class TOKENIZER_TYPE(IntEnum): | ||||
|  | ||||
| # TODO: this string has to exercise as much pre-tokenizer functionality as possible | ||||
| #       will be updated with time - contributions welcome | ||||
| chktxt = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL' | ||||
| CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL' | ||||
|  | ||||
| if len(sys.argv) == 2: | ||||
|     token = sys.argv[1] | ||||
| @@ -100,8 +100,8 @@ def download_file_with_auth(url, token, save_path): | ||||
|     response = sess.get(url, headers=headers) | ||||
|     response.raise_for_status() | ||||
|     os.makedirs(os.path.dirname(save_path), exist_ok=True) | ||||
|     with open(save_path, 'wb') as f: | ||||
|         f.write(response.content) | ||||
|     with open(save_path, 'wb') as downloaded_file: | ||||
|         downloaded_file.write(response.content) | ||||
|     logger.info(f"File {save_path} downloaded successfully") | ||||
|  | ||||
|  | ||||
| @@ -160,7 +160,7 @@ for model in models: | ||||
|         logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}") | ||||
|         continue  # Skip to the next model if the tokenizer can't be loaded | ||||
|  | ||||
|     chktok = tokenizer.encode(chktxt) | ||||
|     chktok = tokenizer.encode(CHK_TXT) | ||||
|     chkhsh = sha256(str(chktok).encode()).hexdigest() | ||||
|  | ||||
|     logger.info(f"model: {name}") | ||||
| @@ -192,7 +192,7 @@ src_func = f""" | ||||
|         # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can | ||||
|         # use in llama.cpp to implement the same pre-tokenizer | ||||
|  | ||||
|         chktxt = {repr(chktxt)} | ||||
|         chktxt = {repr(CHK_TXT)} | ||||
|  | ||||
|         chktok = tokenizer.encode(chktxt) | ||||
|         chkhsh = sha256(str(chktok).encode()).hexdigest() | ||||
| @@ -288,7 +288,7 @@ tests = [ | ||||
|     "333333333", | ||||
|     "Cửa Việt", # llama-bpe fails on this | ||||
|     " discards", | ||||
|     chktxt, | ||||
|     CHK_TXT, | ||||
| ] | ||||
|  | ||||
| # write the tests to ./models/ggml-vocab-{name}.gguf.inp | ||||
|   | ||||
| @@ -132,6 +132,10 @@ class Tensor: | ||||
|  | ||||
|  | ||||
| class GGMLModel: | ||||
|  | ||||
|     file_format: GGMLFormat | ||||
|     format_version: int | ||||
|  | ||||
|     def __init__(self): | ||||
|         self.hyperparameters = None | ||||
|         self.vocab = None | ||||
| @@ -290,7 +294,7 @@ class GGMLToGGUF: | ||||
|         if self.vocab_override is not None: | ||||
|             vo = self.vocab_override | ||||
|             logger.info('* Adding vocab item(s)') | ||||
|             for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()): | ||||
|             for (_, (vbytes, score, ttype)) in enumerate(vo.all_tokens()): | ||||
|                 tokens.append(vbytes) | ||||
|                 scores.append(score) | ||||
|                 toktypes.append(ttype) | ||||
|   | ||||
Reference in New Issue · Block a user
Author: Jiří Podivín