Patch summary (free text before the first "diff --git" header):

  tools/mtmd/clip.cpp
    * clip_hparams: annotate preproc_image_size as "aka max_dimension".
    * image_manipulation (hunk at 3221): the scale factor is no longer
      clamped with std::min(1.0f, ...), so it may exceed 1 when both
      max_dimension / width and max_dimension / height are greater than 1.
    * llava_uhd (hunk at 3385): the overview image is now produced with
      resize_and_pad_image(*img, *resized_img, inst.overview_size) instead of
      bicubic_resize(..., overview_size.width, overview_size.height).
    * clip_image_preprocess: adds two commented-out LOG_INF debug statements.

  tools/mtmd/tests.sh
    * A run is marked OK when the output contains "new york", or contains
      both "men" and "walk" (case-insensitive).

Review notes:
  * NOTE(review): grep -iq "men" is a substring match, so words such as
    "moment" or "women" also satisfy it; consider grep -iqw if only the
    whole word is meant.
  * NOTE(review): the commented-out LOG_INF lines are debug leftovers;
    consider dropping them before merge.
  * NOTE(review): leading whitespace inside the hunks below was reconstructed
    from a whitespace-collapsed copy of this patch - verify against the tree
    before applying (or apply with --ignore-whitespace).

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 0816762530..b44f0a3a28 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -171,7 +171,7 @@ struct clip_hparams {
     int32_t n_head;
     int32_t n_layer;
     // idefics3
-    int32_t preproc_image_size = 0;
+    int32_t preproc_image_size = 0; // aka max_dimension
     int32_t proj_scale_factor = 0;
 
     float image_mean[3];
@@ -3221,8 +3221,8 @@ struct image_manipulation {
             return {0, 0};
         }
 
-        float scale = std::min(1.0f, std::min(static_cast<float>(max_dimension) / inp_size.width,
-                                              static_cast<float>(max_dimension) / inp_size.height));
+        float scale = std::min(static_cast<float>(max_dimension) / inp_size.width,
+                               static_cast<float>(max_dimension) / inp_size.height);
 
         float target_width_f = static_cast<float>(inp_size.width) * scale;
         float target_height_f = static_cast<float>(inp_size.height) * scale;
@@ -3385,7 +3385,7 @@ struct llava_uhd {
 
         // resize to overview size
         clip_image_u8_ptr resized_img(clip_image_u8_init());
-        image_manipulation::bicubic_resize(*img, *resized_img, inst.overview_size.width, inst.overview_size.height);
+        image_manipulation::resize_and_pad_image(*img, *resized_img, inst.overview_size);
         output.push_back(std::move(resized_img));
         if (inst.slices.empty()) {
             // no slices, just return the resized image
@@ -3587,6 +3587,9 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
         // CITE: https://github.com/huggingface/transformers/blob/main/src/transformers/models/idefics3/image_processing_idefics3.py#L737
         const clip_image_size refined_size = image_manipulation::calc_size_preserved_ratio(
             original_size, params.image_size, params.preproc_image_size);
+        // LOG_INF("%s: original size: %d x %d, refined size: %d x %d\n",
+        //         __func__, original_size.width, original_size.height,
+        //         refined_size.width, refined_size.height);
 
         llava_uhd::slice_instructions instructions;
         instructions.overview_size = clip_image_size{params.image_size, params.image_size};
@@ -3597,6 +3600,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
         };
         for (int y = 0; y < refined_size.height; y += params.image_size) {
             for (int x = 0; x < refined_size.width; x += params.image_size) {
+                // LOG_INF("%s: adding slice at x=%d, y=%d\n", __func__, x, y);
                 instructions.slices.push_back(llava_uhd::slice_coordinates{
                     /* x    */x,
                     /* y    */y,
diff --git a/tools/mtmd/tests.sh b/tools/mtmd/tests.sh
index 5e33d12764..c227074636 100755
--- a/tools/mtmd/tests.sh
+++ b/tools/mtmd/tests.sh
@@ -139,7 +139,10 @@ for i in "${!arr_hf[@]}"; do
 
     echo "$output" > $SCRIPT_DIR/output/$bin-$(echo "$hf" | tr '/' '-').log
 
-    if echo "$output" | grep -iq "new york"; then
+    # either contains "new york" or both "men" and "walk"
+    if echo "$output" | grep -iq "new york" \
+            || (echo "$output" | grep -iq "men" && echo "$output" | grep -iq "walk")
+    then
         result="$prefix \033[32mOK\033[0m: $bin $hf"
     else
         result="$prefix \033[31mFAIL\033[0m: $bin $hf"