mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-10-27 08:21:30 +00:00
model : Granite docling + Idefics3 preprocessing (SmolVLM) (#16206)
* feat: Add granite-docling conversion using trillion pretokenizer Branch: gabe-l-hart/GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * feat: Add granite-docling vocab pre enum Branch: gabe-l-hart/GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * fix: Use granite-docling pre Branch: gabe-l-hart/GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * feat: Add clip_is_idefics3 Branch: gabe-l-hart/GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * feat: Allow multi-token boundary sequences for image templating Branch: gabe-l-hart/GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * feat: Add tiling support for idefices3 in clip.cpp This should likely be moved into llava_uhd::get_slice_instructions, but for now this avoids disrupting the logic there. Branch: gabe-l-hart/GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * feat: Partial support for full templating for idefics3 in mtmd There are still errors encoding some of the image chunks, but the token sequence now matches transformers _almost_ perfectly, except for the double newline before the global image which shows up as two consecutive newline tokens instead of a single double-newline token. I think this is happening because the blocks are tokenized separately then concatenated. Branch: gabe-l-hart/GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * feat: Fully working image preprocessing for idefics3 w/ resize and slicing Branch: gabe-l-hart/GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * feat: Parse the preprocessor config's longest side and add it to the mmproj hparams Branch: GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * fix: Use the longest side instead of size * scale_factor For Granite Docling, these come out to the same value, but that was just a conicidence. Branch: GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * fix: Allow batch encoding and remove clip_is_idefics3 Branch: GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * refactor: Remove unnecessary conditionals for empty token vectors Branch: GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * refactor: Use image_manipulation util Branch: GraniteDocling Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> * add test model --------- Signed-off-by: Gabe Goodhart <ghart@us.ibm.com> Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
This commit is contained in:
@@ -347,6 +347,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
|
||||
case LLAMA_VOCAB_PRE_TYPE_OLMO:
|
||||
case LLAMA_VOCAB_PRE_TYPE_JAIS:
|
||||
case LLAMA_VOCAB_PRE_TYPE_TRILLION:
|
||||
case LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING:
|
||||
regex_exprs = {
|
||||
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
|
||||
};
|
||||
@@ -1961,6 +1962,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
|
||||
tokenizer_pre == "trillion") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_TRILLION;
|
||||
clean_spaces = false;
|
||||
} else if (
|
||||
tokenizer_pre == "granite-docling") {
|
||||
pre_type = LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING;
|
||||
clean_spaces = false;
|
||||
} else if (
|
||||
tokenizer_pre == "bailingmoe" ||
|
||||
tokenizer_pre == "llada-moe") {
|
||||
|
||||
@@ -8,46 +8,47 @@
|
||||
|
||||
// pre-tokenization types
|
||||
enum llama_vocab_pre_type {
|
||||
LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
|
||||
LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
|
||||
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
|
||||
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
|
||||
LLAMA_VOCAB_PRE_TYPE_FALCON = 4,
|
||||
LLAMA_VOCAB_PRE_TYPE_MPT = 5,
|
||||
LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
|
||||
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
|
||||
LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
|
||||
LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
|
||||
LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10,
|
||||
LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11,
|
||||
LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
|
||||
LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
|
||||
LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
|
||||
LLAMA_VOCAB_PRE_TYPE_PORO = 15,
|
||||
LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16,
|
||||
LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
|
||||
LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
|
||||
LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
|
||||
LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
|
||||
LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21,
|
||||
LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
|
||||
LLAMA_VOCAB_PRE_TYPE_BLOOM = 23,
|
||||
LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
|
||||
LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
|
||||
LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
|
||||
LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
|
||||
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
|
||||
LLAMA_VOCAB_PRE_TYPE_GPT4O = 29,
|
||||
LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30,
|
||||
LLAMA_VOCAB_PRE_TYPE_TRILLION = 31,
|
||||
LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
|
||||
LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
|
||||
LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
|
||||
LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
|
||||
LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36,
|
||||
LLAMA_VOCAB_PRE_TYPE_KIMI_K2 = 37,
|
||||
LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE = 38,
|
||||
LLAMA_VOCAB_PRE_TYPE_GROK_2 = 39,
|
||||
LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
|
||||
LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
|
||||
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
|
||||
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
|
||||
LLAMA_VOCAB_PRE_TYPE_FALCON = 4,
|
||||
LLAMA_VOCAB_PRE_TYPE_MPT = 5,
|
||||
LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
|
||||
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
|
||||
LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
|
||||
LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
|
||||
LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10,
|
||||
LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11,
|
||||
LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
|
||||
LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
|
||||
LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
|
||||
LLAMA_VOCAB_PRE_TYPE_PORO = 15,
|
||||
LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16,
|
||||
LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
|
||||
LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
|
||||
LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
|
||||
LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
|
||||
LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21,
|
||||
LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
|
||||
LLAMA_VOCAB_PRE_TYPE_BLOOM = 23,
|
||||
LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
|
||||
LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
|
||||
LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
|
||||
LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
|
||||
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
|
||||
LLAMA_VOCAB_PRE_TYPE_GPT4O = 29,
|
||||
LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30,
|
||||
LLAMA_VOCAB_PRE_TYPE_TRILLION = 31,
|
||||
LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
|
||||
LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
|
||||
LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
|
||||
LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
|
||||
LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36,
|
||||
LLAMA_VOCAB_PRE_TYPE_KIMI_K2 = 37,
|
||||
LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE = 38,
|
||||
LLAMA_VOCAB_PRE_TYPE_GROK_2 = 39,
|
||||
LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING = 40,
|
||||
};
|
||||
|
||||
struct LLM_KV;
|
||||
|
||||
Reference in New Issue
Block a user