mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-11-03 09:22:01 +00:00 
			
		
		
		
	vocab : BailingMoE : change possessive quantifiers to greedy (#12677)
This commit is contained in:
		@@ -411,7 +411,8 @@ struct llm_tokenizer_bpe : llm_tokenizer {
 | 
			
		||||
                regex_exprs = {
 | 
			
		||||
                    // original regex from tokenizer.json
 | 
			
		||||
                    // "'(?i:[sdmt]|ll|ve|re)|[^\\r\\n\\p{L}\\p{N}]?+\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]++[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+"
 | 
			
		||||
                    "'(?:[sSdDmMtT]|[lL][lL]|[vV][eE]|[rR][eE])|[^\\r\\n\\p{L}\\p{N}]?+\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]++[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
 | 
			
		||||
                    // FIXME? Changed possessive quantifiers (?+ and ++) to greedy ones (? and +) to avoid errors and imatrix hanging (atomic grouping was tried, but it does not appear to be supported)
 | 
			
		||||
                    "'(?:[sSdDmMtT]|[lL][lL]|[vV][eE]|[rR][eE])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
 | 
			
		||||
                };
 | 
			
		||||
                break;
 | 
			
		||||
            default:
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user