mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-11-04 09:32:00 +00:00 
			
		
		
		
	convert : add support for XLMRoberta embedding models (#8658)
* add conversion for bge-m3; small fix in unigram tokenizer
* clean up and simplify XLMRoberta conversion
This commit is contained in:
		@@ -816,6 +816,9 @@ struct llm_tokenizer_ugm {
 | 
			
		||||
     * the best tokenization.
 | 
			
		||||
    */
 | 
			
		||||
    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
 | 
			
		||||
+        // get current size of output (for reversal later)
 | 
			
		||||
+        size_t output_size = output.size();
 | 
			
		||||
 | 
			
		||||
        // normalize the input first
 | 
			
		||||
        std::string normalized;
 | 
			
		||||
        normalize(text, &normalized);
 | 
			
		||||
@@ -895,7 +898,7 @@ struct llm_tokenizer_ugm {
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        // reverse the output since we added tokens starting from the end of the input
 | 
			
		||||
-        std::reverse(output.begin(), output.end());
 | 
			
		||||
+        std::reverse(output.begin() + output_size, output.end());
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
private:
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user