mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-31 08:51:55 +00:00 
			
		
		
		
	Minor improvements in GPT2 tokenizer (#3567)
* Fixing minor bugs in bpe_gpt2_preprocess
* Don't add bos token in test
This commit is contained in:
		| @@ -6342,7 +6342,6 @@ private: | ||||
|         for (int i = 0; i < (int)text_utf.size(); i++) { | ||||
|             const std::string & utf_char = text_utf[i]; | ||||
|             bool split_condition = false; | ||||
|             // const char* text_pos = raw_text_p + utf_char.seq_offset_bytes; | ||||
|             int bytes_remain = text_utf.size() - i; | ||||
|             // forward backward lookups | ||||
|             const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : ""; | ||||
| @@ -6368,9 +6367,9 @@ private: | ||||
|             if (!split_condition && bytes_remain >= 3) { | ||||
|                 // 're|'ve|'ll | ||||
|                 if (utf_char == "\'" && ( | ||||
|                     (utf_char_next == "r" || utf_char_next_next == "e") || | ||||
|                     (utf_char_next == "v" || utf_char_next_next == "e") || | ||||
|                     (utf_char_next == "l" || utf_char_next_next == "l")) | ||||
|                     (utf_char_next == "r" && utf_char_next_next == "e") || | ||||
|                     (utf_char_next == "v" && utf_char_next_next == "e") || | ||||
|                     (utf_char_next == "l" && utf_char_next_next == "l")) | ||||
|                     ) { | ||||
|                     split_condition = true; | ||||
|                 } | ||||
| @@ -6421,7 +6420,7 @@ private: | ||||
|                 else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) { | ||||
|                     split_condition = true; | ||||
|                 } | ||||
|                 else if (collecting_whitespace_lookahead && codepoint_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE) { | ||||
|                 else if (collecting_whitespace_lookahead && (codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) { | ||||
|                     split_condition = true; | ||||
|                 } | ||||
|             } | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 goerch
					goerch