mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-30 08:42:00 +00:00 
			
		
		
		
	server : support audio input (#13714)
* server : support audio input * add audio support on webui
This commit is contained in:
		| @@ -12,17 +12,7 @@ size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks) { | |||||||
|     size_t n_tokens = 0; |     size_t n_tokens = 0; | ||||||
|     for (size_t i = 0; i < mtmd_input_chunks_size(chunks); i++) { |     for (size_t i = 0; i < mtmd_input_chunks_size(chunks); i++) { | ||||||
|         auto chunk = mtmd_input_chunks_get(chunks, i); |         auto chunk = mtmd_input_chunks_get(chunks, i); | ||||||
|         auto chunk_type = mtmd_input_chunk_get_type(chunk); |         n_tokens += mtmd_input_chunk_get_n_tokens(chunk); | ||||||
|         if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) { |  | ||||||
|             size_t n_tokens_text; |  | ||||||
|             mtmd_input_chunk_get_tokens_text(chunk, &n_tokens_text); |  | ||||||
|             n_tokens += n_tokens_text; |  | ||||||
|         } else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) { |  | ||||||
|             auto tokens_image = mtmd_input_chunk_get_tokens_image(chunk); |  | ||||||
|             n_tokens += mtmd_image_tokens_get_n_tokens(tokens_image); |  | ||||||
|         } else { |  | ||||||
|             GGML_ASSERT(false && "chunk type not supported"); |  | ||||||
|         } |  | ||||||
|     } |     } | ||||||
|     return n_tokens; |     return n_tokens; | ||||||
| } | } | ||||||
| @@ -31,17 +21,7 @@ llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks) { | |||||||
|     llama_pos n_pos = 0; |     llama_pos n_pos = 0; | ||||||
|     for (size_t i = 0; i < mtmd_input_chunks_size(chunks); i++) { |     for (size_t i = 0; i < mtmd_input_chunks_size(chunks); i++) { | ||||||
|         auto chunk = mtmd_input_chunks_get(chunks, i); |         auto chunk = mtmd_input_chunks_get(chunks, i); | ||||||
|         auto chunk_type = mtmd_input_chunk_get_type(chunk); |         n_pos += mtmd_input_chunk_get_n_pos(chunk); | ||||||
|         if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) { |  | ||||||
|             size_t n_tokens_text; |  | ||||||
|             mtmd_input_chunk_get_tokens_text(chunk, &n_tokens_text); |  | ||||||
|             n_pos += n_tokens_text; |  | ||||||
|         } else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) { |  | ||||||
|             auto tokens_image = mtmd_input_chunk_get_tokens_image(chunk); |  | ||||||
|             n_pos += mtmd_image_tokens_get_n_pos(tokens_image); |  | ||||||
|         } else { |  | ||||||
|             GGML_ASSERT(false && "chunk type not supported"); |  | ||||||
|         } |  | ||||||
|     } |     } | ||||||
|     return n_pos; |     return n_pos; | ||||||
| } | } | ||||||
|   | |||||||
| @@ -751,6 +751,10 @@ const unsigned char * mtmd_bitmap_get_data(const mtmd_bitmap * bitmap) { | |||||||
|     return bitmap->data.data(); |     return bitmap->data.data(); | ||||||
| } | } | ||||||
|  |  | ||||||
|  | size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap) { | ||||||
|  |     return bitmap->data.size(); | ||||||
|  | } | ||||||
|  |  | ||||||
| bool mtmd_bitmap_is_audio(const mtmd_bitmap * bitmap) { | bool mtmd_bitmap_is_audio(const mtmd_bitmap * bitmap) { | ||||||
|     return bitmap->is_audio; |     return bitmap->is_audio; | ||||||
| } | } | ||||||
|   | |||||||
| @@ -122,6 +122,7 @@ MTMD_API mtmd_bitmap *         mtmd_bitmap_init_from_audio(size_t n_samples, | |||||||
| MTMD_API uint32_t              mtmd_bitmap_get_nx     (const mtmd_bitmap * bitmap); | MTMD_API uint32_t              mtmd_bitmap_get_nx     (const mtmd_bitmap * bitmap); | ||||||
| MTMD_API uint32_t              mtmd_bitmap_get_ny     (const mtmd_bitmap * bitmap); | MTMD_API uint32_t              mtmd_bitmap_get_ny     (const mtmd_bitmap * bitmap); | ||||||
| MTMD_API const unsigned char * mtmd_bitmap_get_data   (const mtmd_bitmap * bitmap); | MTMD_API const unsigned char * mtmd_bitmap_get_data   (const mtmd_bitmap * bitmap); | ||||||
|  | MTMD_API size_t                mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap); | ||||||
| MTMD_API bool                  mtmd_bitmap_is_audio   (const mtmd_bitmap * bitmap); | MTMD_API bool                  mtmd_bitmap_is_audio   (const mtmd_bitmap * bitmap); | ||||||
| MTMD_API void                  mtmd_bitmap_free       (mtmd_bitmap * bitmap); | MTMD_API void                  mtmd_bitmap_free       (mtmd_bitmap * bitmap); | ||||||
| // bitmap ID is optional, but useful for KV cache tracking | // bitmap ID is optional, but useful for KV cache tracking | ||||||
| @@ -322,6 +323,7 @@ struct bitmap { | |||||||
|     uint32_t nx() { return mtmd_bitmap_get_nx(ptr.get()); } |     uint32_t nx() { return mtmd_bitmap_get_nx(ptr.get()); } | ||||||
|     uint32_t ny() { return mtmd_bitmap_get_ny(ptr.get()); } |     uint32_t ny() { return mtmd_bitmap_get_ny(ptr.get()); } | ||||||
|     const unsigned char * data() { return mtmd_bitmap_get_data(ptr.get()); } |     const unsigned char * data() { return mtmd_bitmap_get_data(ptr.get()); } | ||||||
|  |     size_t n_bytes() { return mtmd_bitmap_get_n_bytes(ptr.get()); } | ||||||
|     std::string id() { return mtmd_bitmap_get_id(ptr.get()); } |     std::string id() { return mtmd_bitmap_get_id(ptr.get()); } | ||||||
|     void set_id(const char * id) { mtmd_bitmap_set_id(ptr.get(), id); } |     void set_id(const char * id) { mtmd_bitmap_set_id(ptr.get(), id); } | ||||||
| }; | }; | ||||||
|   | |||||||
										
											Binary file not shown.
										
									
								
							| @@ -1891,6 +1891,7 @@ struct server_context { | |||||||
|     float slot_prompt_similarity = 0.0f; |     float slot_prompt_similarity = 0.0f; | ||||||
|  |  | ||||||
|     common_chat_templates_ptr chat_templates; |     common_chat_templates_ptr chat_templates; | ||||||
|  |     oaicompat_parser_options  oai_parser_opt; | ||||||
|  |  | ||||||
|     ~server_context() { |     ~server_context() { | ||||||
|         mtmd_free(mctx); |         mtmd_free(mctx); | ||||||
| @@ -2086,6 +2087,15 @@ struct server_context { | |||||||
|         } |         } | ||||||
|  |  | ||||||
|         metrics.init(); |         metrics.init(); | ||||||
|  |  | ||||||
|  |         oai_parser_opt = { | ||||||
|  |             /* use_jinja             */ params_base.use_jinja, | ||||||
|  |             /* prefill_assistant     */ params_base.prefill_assistant, | ||||||
|  |             /* reasoning_format      */ params_base.reasoning_format, | ||||||
|  |             /* common_chat_templates */ chat_templates.get(), | ||||||
|  |             /* allow_image           */ mctx ? mtmd_support_vision(mctx) : false, | ||||||
|  |             /* allow_audio           */ mctx ? mtmd_support_audio (mctx) : false, | ||||||
|  |         }; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     server_slot * get_slot_by_id(int id) { |     server_slot * get_slot_by_id(int id) { | ||||||
| @@ -4092,7 +4102,10 @@ int main(int argc, char ** argv) { | |||||||
|             { "default_generation_settings", ctx_server.default_generation_settings_for_props }, |             { "default_generation_settings", ctx_server.default_generation_settings_for_props }, | ||||||
|             { "total_slots",                 ctx_server.params_base.n_parallel }, |             { "total_slots",                 ctx_server.params_base.n_parallel }, | ||||||
|             { "model_path",                  ctx_server.params_base.model.path }, |             { "model_path",                  ctx_server.params_base.model.path }, | ||||||
|             { "modalities",                  json{{"vision", ctx_server.mctx != nullptr}} }, // TODO: add more in the future |             { "modalities",                  json{ | ||||||
|  |                 {"vision", ctx_server.oai_parser_opt.allow_image}, | ||||||
|  |                 {"audio",  ctx_server.oai_parser_opt.allow_audio}, | ||||||
|  |             } }, | ||||||
|             { "chat_template",               common_chat_templates_source(ctx_server.chat_templates.get()) }, |             { "chat_template",               common_chat_templates_source(ctx_server.chat_templates.get()) }, | ||||||
|             { "bos_token",                   common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)}, |             { "bos_token",                   common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)}, | ||||||
|             { "eos_token",                   common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)}, |             { "eos_token",                   common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)}, | ||||||
| @@ -4183,10 +4196,10 @@ int main(int argc, char ** argv) { | |||||||
|                 for (auto & file : files) { |                 for (auto & file : files) { | ||||||
|                     mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(file.data(), file.size())); |                     mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(file.data(), file.size())); | ||||||
|                     if (!bmp.ptr) { |                     if (!bmp.ptr) { | ||||||
|                         throw std::runtime_error("Failed to load image"); |                         throw std::runtime_error("Failed to load image or audio file"); | ||||||
|                     } |                     } | ||||||
|                     // calculate bitmap hash (for KV caching) |                     // calculate bitmap hash (for KV caching) | ||||||
|                     std::string hash = fnv_hash(bmp.data(), bmp.nx()*bmp.ny()*3); |                     std::string hash = fnv_hash(bmp.data(), bmp.n_bytes()); | ||||||
|                     bmp.set_id(hash.c_str()); |                     bmp.set_id(hash.c_str()); | ||||||
|                     bitmaps.entries.push_back(std::move(bmp)); |                     bitmaps.entries.push_back(std::move(bmp)); | ||||||
|                 } |                 } | ||||||
| @@ -4418,7 +4431,7 @@ int main(int argc, char ** argv) { | |||||||
|             OAICOMPAT_TYPE_NONE); // infill is not OAI compatible |             OAICOMPAT_TYPE_NONE); // infill is not OAI compatible | ||||||
|     }; |     }; | ||||||
|  |  | ||||||
|     const auto handle_chat_completions = [&ctx_server, &params, &res_error, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) { |     const auto handle_chat_completions = [&ctx_server, &res_error, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) { | ||||||
|         LOG_DBG("request: %s\n", req.body.c_str()); |         LOG_DBG("request: %s\n", req.body.c_str()); | ||||||
|         if (ctx_server.params_base.embedding) { |         if (ctx_server.params_base.embedding) { | ||||||
|             res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); |             res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); | ||||||
| @@ -4427,13 +4440,9 @@ int main(int argc, char ** argv) { | |||||||
|  |  | ||||||
|         auto body = json::parse(req.body); |         auto body = json::parse(req.body); | ||||||
|         std::vector<raw_buffer> files; |         std::vector<raw_buffer> files; | ||||||
|         json data = oaicompat_completion_params_parse( |         json data = oaicompat_chat_params_parse( | ||||||
|             body, |             body, | ||||||
|             params.use_jinja, |             ctx_server.oai_parser_opt, | ||||||
|             params.prefill_assistant, |  | ||||||
|             params.reasoning_format, |  | ||||||
|             ctx_server.chat_templates.get(), |  | ||||||
|             ctx_server.mctx, |  | ||||||
|             files); |             files); | ||||||
|  |  | ||||||
|         handle_completions_impl( |         handle_completions_impl( | ||||||
| @@ -4446,16 +4455,12 @@ int main(int argc, char ** argv) { | |||||||
|     }; |     }; | ||||||
|  |  | ||||||
|     // same with handle_chat_completions, but without inference part |     // same with handle_chat_completions, but without inference part | ||||||
|     const auto handle_apply_template = [&ctx_server, &params, &res_ok](const httplib::Request & req, httplib::Response & res) { |     const auto handle_apply_template = [&ctx_server, &res_ok](const httplib::Request & req, httplib::Response & res) { | ||||||
|         auto body = json::parse(req.body); |         auto body = json::parse(req.body); | ||||||
|         std::vector<raw_buffer> files; // dummy, unused |         std::vector<raw_buffer> files; // dummy, unused | ||||||
|         json data = oaicompat_completion_params_parse( |         json data = oaicompat_chat_params_parse( | ||||||
|             body, |             body, | ||||||
|             params.use_jinja, |             ctx_server.oai_parser_opt, | ||||||
|             params.prefill_assistant, |  | ||||||
|             params.reasoning_format, |  | ||||||
|             ctx_server.chat_templates.get(), |  | ||||||
|             ctx_server.mctx, |  | ||||||
|             files); |             files); | ||||||
|         res_ok(res, {{ "prompt", std::move(data.at("prompt")) }}); |         res_ok(res, {{ "prompt", std::move(data.at("prompt")) }}); | ||||||
|     }; |     }; | ||||||
|   | |||||||
| @@ -30,6 +30,7 @@ def create_server(): | |||||||
|         ("What is this:\n", "malformed",              False, None), |         ("What is this:\n", "malformed",              False, None), | ||||||
|         ("What is this:\n", "https://google.com/404", False, None), # non-existent image |         ("What is this:\n", "https://google.com/404", False, None), # non-existent image | ||||||
|         ("What is this:\n", "https://ggml.ai",        False, None), # non-image data |         ("What is this:\n", "https://ggml.ai",        False, None), # non-image data | ||||||
|  |         # TODO @ngxson : test with multiple images, no images and with audio | ||||||
|     ] |     ] | ||||||
| ) | ) | ||||||
| def test_vision_chat_completion(prompt, image_url, success, re_content): | def test_vision_chat_completion(prompt, image_url, success, re_content): | ||||||
|   | |||||||
| @@ -536,6 +536,7 @@ static bool server_sent_event(httplib::DataSink & sink, const char * event, cons | |||||||
| // OAI utils | // OAI utils | ||||||
| // | // | ||||||
|  |  | ||||||
|  | // used by /completions endpoint | ||||||
| static json oaicompat_completion_params_parse(const json & body) { | static json oaicompat_completion_params_parse(const json & body) { | ||||||
|     json llama_params; |     json llama_params; | ||||||
|  |  | ||||||
| @@ -580,13 +581,19 @@ static json oaicompat_completion_params_parse(const json & body) { | |||||||
|     return llama_params; |     return llama_params; | ||||||
| } | } | ||||||
|  |  | ||||||
| static json oaicompat_completion_params_parse( | struct oaicompat_parser_options { | ||||||
|  |     bool use_jinja; | ||||||
|  |     bool prefill_assistant; | ||||||
|  |     common_reasoning_format reasoning_format; | ||||||
|  |     common_chat_templates * tmpls; | ||||||
|  |     bool allow_image; | ||||||
|  |     bool allow_audio; | ||||||
|  | }; | ||||||
|  |  | ||||||
|  | // used by /chat/completions endpoint | ||||||
|  | static json oaicompat_chat_params_parse( | ||||||
|     const json & body, /* openai api json semantics */ |     const json & body, /* openai api json semantics */ | ||||||
|     bool use_jinja, |     const oaicompat_parser_options & opt, | ||||||
|     bool prefill_assistant, |  | ||||||
|     common_reasoning_format reasoning_format, |  | ||||||
|     const struct common_chat_templates * tmpls, |  | ||||||
|     bool allow_non_text, |  | ||||||
|     std::vector<raw_buffer> & out_files) |     std::vector<raw_buffer> & out_files) | ||||||
| { | { | ||||||
|     json llama_params; |     json llama_params; | ||||||
| @@ -598,11 +605,11 @@ static json oaicompat_completion_params_parse( | |||||||
|         if (stream) { |         if (stream) { | ||||||
|             throw std::runtime_error("Cannot use tools with stream"); |             throw std::runtime_error("Cannot use tools with stream"); | ||||||
|         } |         } | ||||||
|         if (!use_jinja) { |         if (!opt.use_jinja) { | ||||||
|             throw std::runtime_error("tools param requires --jinja flag"); |             throw std::runtime_error("tools param requires --jinja flag"); | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|     if (!use_jinja) { |     if (!opt.use_jinja) { | ||||||
|         if (body.contains("tool_choice") && !body.at("tool_choice").is_null()) { |         if (body.contains("tool_choice") && !body.at("tool_choice").is_null()) { | ||||||
|             throw std::runtime_error("Unsupported param: tool_choice"); |             throw std::runtime_error("Unsupported param: tool_choice"); | ||||||
|         } |         } | ||||||
| @@ -667,12 +674,12 @@ static json oaicompat_completion_params_parse( | |||||||
|  |  | ||||||
|         for (auto & p : content) { |         for (auto & p : content) { | ||||||
|             std::string type      = json_value(p, "type", std::string()); |             std::string type      = json_value(p, "type", std::string()); | ||||||
|             json        image_url = json_value(p, "image_url", json::object()); |  | ||||||
|             if (type == "image_url") { |             if (type == "image_url") { | ||||||
|                 if (!allow_non_text) { |                 if (!opt.allow_image) { | ||||||
|                     throw std::runtime_error("image input is not supported by this server"); |                     throw std::runtime_error("image input is not supported - hint: if this is unexpected, you may need to provide the mmproj"); | ||||||
|                 } |                 } | ||||||
|  |  | ||||||
|  |                 json image_url  = json_value(p, "image_url", json::object()); | ||||||
|                 std::string url = json_value(image_url, "url", std::string()); |                 std::string url = json_value(image_url, "url", std::string()); | ||||||
|                 if (string_starts_with(url, "http")) { |                 if (string_starts_with(url, "http")) { | ||||||
|                     // download remote image |                     // download remote image | ||||||
| @@ -712,6 +719,29 @@ static json oaicompat_completion_params_parse( | |||||||
|                 p["type"] = "text"; |                 p["type"] = "text"; | ||||||
|                 p["text"] = mtmd_default_marker(); |                 p["text"] = mtmd_default_marker(); | ||||||
|                 p.erase("image_url"); |                 p.erase("image_url"); | ||||||
|  |  | ||||||
|  |             } else if (type == "input_audio") { | ||||||
|  |                 if (!opt.allow_audio) { | ||||||
|  |                     throw std::runtime_error("audio input is not supported - hint: if this is unexpected, you may need to provide the mmproj"); | ||||||
|  |                 } | ||||||
|  |  | ||||||
|  |                 json input_audio   = json_value(p, "input_audio", json::object()); | ||||||
|  |                 std::string data   = json_value(input_audio, "data", std::string()); | ||||||
|  |                 std::string format = json_value(input_audio, "format", std::string()); | ||||||
|  |                 // while we also support flac, we don't allow it here so we matches the OAI spec | ||||||
|  |                 if (format != "wav" && format != "mp3") { | ||||||
|  |                     throw std::runtime_error("input_audio.format must be either 'wav' or 'mp3'"); | ||||||
|  |                 } | ||||||
|  |                 auto decoded_data = base64_decode(data); // expected to be base64 encoded | ||||||
|  |                 out_files.push_back(decoded_data); | ||||||
|  |  | ||||||
|  |                 // replace this chunk with a marker | ||||||
|  |                 p["type"] = "text"; | ||||||
|  |                 p["text"] = mtmd_default_marker(); | ||||||
|  |                 p.erase("input_audio"); | ||||||
|  |  | ||||||
|  |             } else if (type != "text") { | ||||||
|  |                 throw std::runtime_error("unsupported content[].type"); | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| @@ -723,9 +753,9 @@ static json oaicompat_completion_params_parse( | |||||||
|     inputs.json_schema           = json_schema.is_null() ? "" : json_schema.dump(); |     inputs.json_schema           = json_schema.is_null() ? "" : json_schema.dump(); | ||||||
|     inputs.grammar               = grammar; |     inputs.grammar               = grammar; | ||||||
|     inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true); |     inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true); | ||||||
|     inputs.use_jinja             = use_jinja; |     inputs.use_jinja             = opt.use_jinja; | ||||||
|     inputs.parallel_tool_calls   = json_value(body, "parallel_tool_calls", false); |     inputs.parallel_tool_calls   = json_value(body, "parallel_tool_calls", false); | ||||||
|     inputs.extract_reasoning     = reasoning_format != COMMON_REASONING_FORMAT_NONE; |     inputs.extract_reasoning     = opt.reasoning_format != COMMON_REASONING_FORMAT_NONE; | ||||||
|     inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true); |     inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true); | ||||||
|     if (!inputs.tools.empty() && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && body.contains("grammar")) { |     if (!inputs.tools.empty() && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && body.contains("grammar")) { | ||||||
|         throw std::runtime_error("Cannot use custom grammar constraints with tools."); |         throw std::runtime_error("Cannot use custom grammar constraints with tools."); | ||||||
| @@ -733,7 +763,7 @@ static json oaicompat_completion_params_parse( | |||||||
|  |  | ||||||
|     // if the assistant message appears at the end of list, we do not add end-of-turn token |     // if the assistant message appears at the end of list, we do not add end-of-turn token | ||||||
|     // for ex. this can be useful to modify the reasoning process in reasoning models |     // for ex. this can be useful to modify the reasoning process in reasoning models | ||||||
|     bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant" && prefill_assistant; |     bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant" && opt.prefill_assistant; | ||||||
|     common_chat_msg last_message; |     common_chat_msg last_message; | ||||||
|     if (prefill_assistant_message) { |     if (prefill_assistant_message) { | ||||||
|         last_message = inputs.messages.back(); |         last_message = inputs.messages.back(); | ||||||
| @@ -749,7 +779,7 @@ static json oaicompat_completion_params_parse( | |||||||
|     } |     } | ||||||
|  |  | ||||||
|     // Apply chat template to the list of messages |     // Apply chat template to the list of messages | ||||||
|     auto chat_params = common_chat_templates_apply(tmpls, inputs); |     auto chat_params = common_chat_templates_apply(opt.tmpls, inputs); | ||||||
|  |  | ||||||
|     /* Append assistant prefilled message */ |     /* Append assistant prefilled message */ | ||||||
|     if (prefill_assistant_message) { |     if (prefill_assistant_message) { | ||||||
| @@ -1040,7 +1070,7 @@ struct server_tokens { | |||||||
| private: // disallow accessing these members directly, risking out-of-sync | private: // disallow accessing these members directly, risking out-of-sync | ||||||
|  |  | ||||||
|     // map a **start** position in tokens to the image chunk |     // map a **start** position in tokens to the image chunk | ||||||
|     std::unordered_map<llama_pos, mtmd::input_chunk_ptr> map_pos_to_image; |     std::unordered_map<llama_pos, mtmd::input_chunk_ptr> map_pos_to_media; | ||||||
|  |  | ||||||
|     // list of tokens |     // list of tokens | ||||||
|     // it can include LLAMA_TOKEN_NULL, which is used to indicate a token that is not a text token |     // it can include LLAMA_TOKEN_NULL, which is used to indicate a token that is not a text token | ||||||
| @@ -1051,7 +1081,7 @@ private: // disallow accessing these members directly, risking out-of-sync | |||||||
|     // for ex. with input of 5 text tokens and 2 images: |     // for ex. with input of 5 text tokens and 2 images: | ||||||
|     //      [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1] |     //      [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1] | ||||||
|     // pos  0   1   2   3   4   5      6      7      8      9 |     // pos  0   1   2   3   4   5      6      7      8      9 | ||||||
|     // map_pos_to_image will contain: {5, img0}, {8, img1} |     // map_pos_to_media will contain: {5, img0}, {8, img1} | ||||||
|  |  | ||||||
| public: | public: | ||||||
|     server_tokens() = default; |     server_tokens() = default; | ||||||
| @@ -1090,15 +1120,15 @@ public: | |||||||
|         } |         } | ||||||
|         oss << "\n"; |         oss << "\n"; | ||||||
|         oss << "image pos: "; |         oss << "image pos: "; | ||||||
|         for (const auto & it : map_pos_to_image) { |         for (const auto & it : map_pos_to_media) { | ||||||
|             oss << it.first << ", "; |             oss << it.first << ", "; | ||||||
|         } |         } | ||||||
|         return oss.str(); |         return oss.str(); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     const mtmd::input_chunk_ptr & find_chunk(llama_pos pos) const { |     const mtmd::input_chunk_ptr & find_chunk(llama_pos pos) const { | ||||||
|         auto it = map_pos_to_image.find(pos); |         auto it = map_pos_to_media.find(pos); | ||||||
|         if (it != map_pos_to_image.end()) { |         if (it != map_pos_to_media.end()) { | ||||||
|             return it->second; |             return it->second; | ||||||
|         } else { |         } else { | ||||||
|             throw std::runtime_error("Chunk not found"); |             throw std::runtime_error("Chunk not found"); | ||||||
| @@ -1115,16 +1145,15 @@ public: | |||||||
|     // will create a copy of the chunk if it contains non-text data |     // will create a copy of the chunk if it contains non-text data | ||||||
|     void push_back(const mtmd_input_chunk * chunk) { |     void push_back(const mtmd_input_chunk * chunk) { | ||||||
|         auto type = mtmd_input_chunk_get_type(chunk); |         auto type = mtmd_input_chunk_get_type(chunk); | ||||||
|         if (type == MTMD_INPUT_CHUNK_TYPE_IMAGE) { |         if (type == MTMD_INPUT_CHUNK_TYPE_IMAGE || type == MTMD_INPUT_CHUNK_TYPE_AUDIO) { | ||||||
|             GGML_ASSERT(has_mtmd); |             GGML_ASSERT(has_mtmd); | ||||||
|             auto img_tokens = mtmd_input_chunk_get_tokens_image(chunk); |             const int n_pos = mtmd_input_chunk_get_n_pos(chunk); | ||||||
|             const int n_pos = mtmd_image_tokens_get_n_pos(img_tokens); |  | ||||||
|             llama_pos start_pos = tokens.size(); |             llama_pos start_pos = tokens.size(); | ||||||
|             for (int i = 0; i < n_pos; ++i) { |             for (int i = 0; i < n_pos; ++i) { | ||||||
|                 tokens.emplace_back(LLAMA_TOKEN_NULL); |                 tokens.emplace_back(LLAMA_TOKEN_NULL); | ||||||
|             } |             } | ||||||
|             mtmd::input_chunk_ptr new_chunk(mtmd_input_chunk_copy(chunk)); |             mtmd::input_chunk_ptr new_chunk(mtmd_input_chunk_copy(chunk)); | ||||||
|             map_pos_to_image[start_pos] = std::move(new_chunk); |             map_pos_to_media[start_pos] = std::move(new_chunk); | ||||||
|         } else if (type == MTMD_INPUT_CHUNK_TYPE_TEXT) { |         } else if (type == MTMD_INPUT_CHUNK_TYPE_TEXT) { | ||||||
|             size_t n_tokens; |             size_t n_tokens; | ||||||
|             auto text_tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens); |             auto text_tokens = mtmd_input_chunk_get_tokens_text(chunk, &n_tokens); | ||||||
| @@ -1169,6 +1198,9 @@ public: | |||||||
|     void keep_first(size_t n) { |     void keep_first(size_t n) { | ||||||
|         GGML_ASSERT(n <= tokens.size()); |         GGML_ASSERT(n <= tokens.size()); | ||||||
|         if (has_mtmd) { |         if (has_mtmd) { | ||||||
|  |             if (n == tokens.size()) { | ||||||
|  |                 return; // nothing to do | ||||||
|  |             } | ||||||
|             // we throw an error if we try to remove a token in the middle of an image |             // we throw an error if we try to remove a token in the middle of an image | ||||||
|             // for ex. with input of 5 text tokens and 2 images: |             // for ex. with input of 5 text tokens and 2 images: | ||||||
|             //    [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1] |             //    [0] [1] [2] [3] [4] [img0] [img0] [img0] [img1] [img1] | ||||||
| @@ -1183,10 +1215,10 @@ public: | |||||||
|                 } |                 } | ||||||
|             } |             } | ||||||
|             // remove all image chunks that are not used anymore |             // remove all image chunks that are not used anymore | ||||||
|             for (auto it = map_pos_to_image.begin(); it != map_pos_to_image.end(); ) { |             for (auto it = map_pos_to_media.begin(); it != map_pos_to_media.end(); ) { | ||||||
|                 llama_pos pos = it->first; |                 llama_pos pos = it->first; | ||||||
|                 if (pos >= (llama_pos)n) { |                 if (pos >= (llama_pos)n) { | ||||||
|                     it = map_pos_to_image.erase(it); |                     it = map_pos_to_media.erase(it); | ||||||
|                 } else { |                 } else { | ||||||
|                     ++it; |                     ++it; | ||||||
|                 } |                 } | ||||||
| @@ -1217,14 +1249,12 @@ public: | |||||||
|                 const auto & a_chunk =   find_chunk(i); |                 const auto & a_chunk =   find_chunk(i); | ||||||
|                 const auto & b_chunk = b.find_chunk(i); |                 const auto & b_chunk = b.find_chunk(i); | ||||||
|                 GGML_ASSERT(a_chunk && b_chunk); |                 GGML_ASSERT(a_chunk && b_chunk); | ||||||
|                 const auto * a_img = mtmd_input_chunk_get_tokens_image(a_chunk.get()); |                 std::string ai_id  = mtmd_input_chunk_get_id(a_chunk.get()); | ||||||
|                 const auto * b_img = mtmd_input_chunk_get_tokens_image(b_chunk.get()); |                 std::string bi_id  = mtmd_input_chunk_get_id(b_chunk.get()); | ||||||
|                 std::string ai_id  = mtmd_image_tokens_get_id(a_img); |                 size_t a_pos       = mtmd_input_chunk_get_n_pos(a_chunk.get()); | ||||||
|                 std::string bi_id  = mtmd_image_tokens_get_id(b_img); |                 size_t b_pos       = mtmd_input_chunk_get_n_pos(b_chunk.get()); | ||||||
|                 size_t a_pos       = mtmd_image_tokens_get_n_pos(a_img); |  | ||||||
|                 size_t b_pos       = mtmd_image_tokens_get_n_pos(b_img); |  | ||||||
|                 if (ai_id == bi_id && a_pos == b_pos) { |                 if (ai_id == bi_id && a_pos == b_pos) { | ||||||
|                     GGML_ASSERT(a_pos > 0 && "Invalid image token"); // should never happen |                     GGML_ASSERT(a_pos > 0 && "Invalid media chunk"); // should never happen | ||||||
|                     i += a_pos - 1; // will be +1 by the for loop |                     i += a_pos - 1; // will be +1 by the for loop | ||||||
|                     continue; |                     continue; | ||||||
|                 } else { |                 } else { | ||||||
| @@ -1250,8 +1280,7 @@ public: | |||||||
|             if (t == LLAMA_TOKEN_NULL) { |             if (t == LLAMA_TOKEN_NULL) { | ||||||
|                 try { |                 try { | ||||||
|                     const auto & chunk = find_chunk(i); |                     const auto & chunk = find_chunk(i); | ||||||
|                     const auto * img_tokens = mtmd_input_chunk_get_tokens_image(chunk.get()); |                     size_t n_pos = mtmd_input_chunk_get_n_pos(chunk.get()); | ||||||
|                     size_t n_pos = mtmd_image_tokens_get_n_pos(img_tokens); |  | ||||||
|                     i += n_pos - 1; // will be +1 by the for loop |                     i += n_pos - 1; // will be +1 by the for loop | ||||||
|                 } catch (const std::exception & e) { |                 } catch (const std::exception & e) { | ||||||
|                     return false; |                     return false; | ||||||
| @@ -1270,22 +1299,21 @@ public: | |||||||
|                 llama_pos n_past, |                 llama_pos n_past, | ||||||
|                 int32_t seq_id, |                 int32_t seq_id, | ||||||
|                 llama_pos & n_pos_out) { |                 llama_pos & n_pos_out) { | ||||||
|         auto it = map_pos_to_image.find(n_past); |         auto & chunk = find_chunk(n_past); | ||||||
|         if (it == map_pos_to_image.end()) { |         const char * name = mtmd_input_chunk_get_type(chunk.get()) == MTMD_INPUT_CHUNK_TYPE_IMAGE | ||||||
|             throw std::runtime_error("Chunk not found"); |                             ? "image" : "audio"; | ||||||
|         } |         SRV_INF("processing %s...\n", name); | ||||||
|         SRV_INF("%s\n", "processing image..."); |  | ||||||
|         int32_t n_batch = llama_n_batch(ctx); |         int32_t n_batch = llama_n_batch(ctx); | ||||||
|         int64_t t0 = ggml_time_ms(); |         int64_t t0 = ggml_time_ms(); | ||||||
|         llama_pos new_n_past = n_past; |         llama_pos new_n_past = n_past; | ||||||
|         int32_t result = mtmd_helper_eval_chunk_single(mctx, ctx, |         int32_t result = mtmd_helper_eval_chunk_single(mctx, ctx, | ||||||
|             it->second.get(), // chunk |             chunk.get(), | ||||||
|             n_past, |             n_past, | ||||||
|             seq_id, |             seq_id, | ||||||
|             n_batch, |             n_batch, | ||||||
|             true, // logits last |             true, // logits last | ||||||
|             &new_n_past); |             &new_n_past); | ||||||
|         SRV_INF("image processed in %" PRId64 " ms\n", ggml_time_ms() - t0); |         SRV_INF("%s processed in %" PRId64 " ms\n", name, ggml_time_ms() - t0); | ||||||
|         if (result != 0) { |         if (result != 0) { | ||||||
|             LOG_ERR("mtmd_helper_eval failed with status %d", result); |             LOG_ERR("mtmd_helper_eval failed with status %d", result); | ||||||
|             n_pos_out = n_past; |             n_pos_out = n_past; | ||||||
|   | |||||||
| @@ -1,4 +1,8 @@ | |||||||
| import { DocumentTextIcon, XMarkIcon } from '@heroicons/react/24/outline'; | import { | ||||||
|  |   DocumentTextIcon, | ||||||
|  |   SpeakerWaveIcon, | ||||||
|  |   XMarkIcon, | ||||||
|  | } from '@heroicons/react/24/outline'; | ||||||
| import { MessageExtra } from '../utils/types'; | import { MessageExtra } from '../utils/types'; | ||||||
| import { useState } from 'react'; | import { useState } from 'react'; | ||||||
| import { classNames } from '../utils/misc'; | import { classNames } from '../utils/misc'; | ||||||
| @@ -66,7 +70,11 @@ export default function ChatInputExtraContextItem({ | |||||||
|                   className="w-14 h-14 flex items-center justify-center" |                   className="w-14 h-14 flex items-center justify-center" | ||||||
|                   aria-description="Document icon" |                   aria-description="Document icon" | ||||||
|                 > |                 > | ||||||
|                   <DocumentTextIcon className="h-8 w-14 text-base-content/50" /> |                   {item.type === 'audioFile' ? ( | ||||||
|  |                     <SpeakerWaveIcon className="h-8 w-8 text-gray-500" /> | ||||||
|  |                   ) : ( | ||||||
|  |                     <DocumentTextIcon className="h-8 w-8 text-gray-500" /> | ||||||
|  |                   )} | ||||||
|                 </div> |                 </div> | ||||||
|  |  | ||||||
|                 <div className="text-xs pr-4"> |                 <div className="text-xs pr-4"> | ||||||
| @@ -98,6 +106,19 @@ export default function ChatInputExtraContextItem({ | |||||||
|                 src={showingItem.base64Url} |                 src={showingItem.base64Url} | ||||||
|                 alt={`Preview image for ${showingItem.name}`} |                 alt={`Preview image for ${showingItem.name}`} | ||||||
|               /> |               /> | ||||||
|  |             ) : showingItem.type === 'audioFile' ? ( | ||||||
|  |               <audio | ||||||
|  |                 controls | ||||||
|  |                 className="w-full" | ||||||
|  |                 aria-description={`Audio file ${showingItem.name}`} | ||||||
|  |               > | ||||||
|  |                 <source | ||||||
|  |                   src={`data:${showingItem.mimeType};base64,${showingItem.base64Data}`} | ||||||
|  |                   type={showingItem.mimeType} | ||||||
|  |                   aria-description={`Audio file ${showingItem.name}`} | ||||||
|  |                 /> | ||||||
|  |                 Your browser does not support the audio element. | ||||||
|  |               </audio> | ||||||
|             ) : ( |             ) : ( | ||||||
|               <div className="overflow-x-auto"> |               <div className="overflow-x-auto"> | ||||||
|                 <pre className="whitespace-pre-wrap break-words text-sm"> |                 <pre className="whitespace-pre-wrap break-words text-sm"> | ||||||
|   | |||||||
| @@ -278,6 +278,13 @@ export default function ChatScreen() { | |||||||
|  |  | ||||||
| function ServerInfo() { | function ServerInfo() { | ||||||
|   const { serverProps } = useAppContext(); |   const { serverProps } = useAppContext(); | ||||||
|  |   const modalities = []; | ||||||
|  |   if (serverProps?.modalities?.audio) { | ||||||
|  |     modalities.push('audio'); | ||||||
|  |   } | ||||||
|  |   if (serverProps?.modalities?.vision) { | ||||||
|  |     modalities.push('vision'); | ||||||
|  |   } | ||||||
|   return ( |   return ( | ||||||
|     <div |     <div | ||||||
|       className="card card-sm shadow-sm border-1 border-base-content/20 text-base-content/70 mb-6" |       className="card card-sm shadow-sm border-1 border-base-content/20 text-base-content/70 mb-6" | ||||||
| @@ -291,6 +298,13 @@ function ServerInfo() { | |||||||
|           <br /> |           <br /> | ||||||
|           <b>Build</b>: {serverProps?.build_info} |           <b>Build</b>: {serverProps?.build_info} | ||||||
|           <br /> |           <br /> | ||||||
|  |           {modalities.length > 0 ? ( | ||||||
|  |             <> | ||||||
|  |               <b>Supported modalities:</b> {modalities.join(', ')} | ||||||
|  |             </> | ||||||
|  |           ) : ( | ||||||
|  |             '' | ||||||
|  |           )} | ||||||
|         </p> |         </p> | ||||||
|       </div> |       </div> | ||||||
|     </div> |     </div> | ||||||
|   | |||||||
| @@ -11,6 +11,7 @@ pdfjs.GlobalWorkerOptions.workerSrc = pdfjsWorkerSrc; | |||||||
| // This file handles uploading extra context items (a.k.a files) | // This file handles uploading extra context items (a.k.a files) | ||||||
| // It allows processing these kinds of files: | // It allows processing these kinds of files: | ||||||
| // - image files (converted to base64) | // - image files (converted to base64) | ||||||
|  | // - audio files (converted to base64) | ||||||
| // - text files (including code files) | // - text files (including code files) | ||||||
| // - pdf (converted to text) | // - pdf (converted to text) | ||||||
|  |  | ||||||
| @@ -41,10 +42,10 @@ export function useChatExtraContext(): ChatExtraContextApi { | |||||||
|  |  | ||||||
|   const isSupportVision = serverProps?.modalities?.vision; |   const isSupportVision = serverProps?.modalities?.vision; | ||||||
|  |  | ||||||
|   const onFileAdded = (files: File[]) => { |   const onFileAdded = async (files: File[]) => { | ||||||
|  |     try { | ||||||
|       for (const file of files) { |       for (const file of files) { | ||||||
|         const mimeType = file.type; |         const mimeType = file.type; | ||||||
|       console.debug({ mimeType, file }); |  | ||||||
|         if (file.size > 10 * 1024 * 1024) { |         if (file.size > 10 * 1024 * 1024) { | ||||||
|           toast.error('File is too large. Maximum size is 10MB.'); |           toast.error('File is too large. Maximum size is 10MB.'); | ||||||
|           break; |           break; | ||||||
| @@ -55,16 +56,12 @@ export function useChatExtraContext(): ChatExtraContextApi { | |||||||
|             toast.error('Multimodal is not supported by this server or model.'); |             toast.error('Multimodal is not supported by this server or model.'); | ||||||
|             break; |             break; | ||||||
|           } |           } | ||||||
|         const reader = new FileReader(); |  | ||||||
|         reader.onload = async (event) => { |  | ||||||
|           if (event.target?.result) { |  | ||||||
|             let base64Url = event.target.result as string; |  | ||||||
|  |  | ||||||
|  |           let base64Url = await getFileAsBase64(file); | ||||||
|           if (mimeType === 'image/svg+xml') { |           if (mimeType === 'image/svg+xml') { | ||||||
|             // Convert SVG to PNG |             // Convert SVG to PNG | ||||||
|             base64Url = await svgBase64UrlToPngDataURL(base64Url); |             base64Url = await svgBase64UrlToPngDataURL(base64Url); | ||||||
|           } |           } | ||||||
|  |  | ||||||
|           addItems([ |           addItems([ | ||||||
|             { |             { | ||||||
|               type: 'imageFile', |               type: 'imageFile', | ||||||
| @@ -72,15 +69,25 @@ export function useChatExtraContext(): ChatExtraContextApi { | |||||||
|               base64Url, |               base64Url, | ||||||
|             }, |             }, | ||||||
|           ]); |           ]); | ||||||
|           } |         } else if (mimeType.startsWith('video/')) { | ||||||
|         }; |           toast.error('Video files are not supported yet.'); | ||||||
|         reader.readAsDataURL(file); |  | ||||||
|       } else if ( |  | ||||||
|         mimeType.startsWith('video/') || |  | ||||||
|         mimeType.startsWith('audio/') |  | ||||||
|       ) { |  | ||||||
|         toast.error('Video and audio files are not supported yet.'); |  | ||||||
|           break; |           break; | ||||||
|  |         } else if (mimeType.startsWith('audio/')) { | ||||||
|  |           if (!/mpeg|wav/.test(mimeType)) { | ||||||
|  |             toast.error('Only mp3 and wav audio files are supported.'); | ||||||
|  |             break; | ||||||
|  |           } | ||||||
|  |  | ||||||
|  |           // plain base64, not a data URL | ||||||
|  |           const base64Data = await getFileAsBase64(file, false); | ||||||
|  |           addItems([ | ||||||
|  |             { | ||||||
|  |               type: 'audioFile', | ||||||
|  |               name: file.name, | ||||||
|  |               mimeType, | ||||||
|  |               base64Data, | ||||||
|  |             }, | ||||||
|  |           ]); | ||||||
|         } else if (mimeType.startsWith('application/pdf')) { |         } else if (mimeType.startsWith('application/pdf')) { | ||||||
|           if (config.pdfAsImage && !isSupportVision) { |           if (config.pdfAsImage && !isSupportVision) { | ||||||
|             toast( |             toast( | ||||||
| @@ -89,9 +96,9 @@ export function useChatExtraContext(): ChatExtraContextApi { | |||||||
|             break; |             break; | ||||||
|           } |           } | ||||||
|  |  | ||||||
|         const promise = |           if (config.pdfAsImage && isSupportVision) { | ||||||
|           config.pdfAsImage && isSupportVision |             // Convert PDF to images | ||||||
|             ? convertPDFToImage(file).then((base64Urls) => { |             const base64Urls = await convertPDFToImage(file); | ||||||
|             addItems( |             addItems( | ||||||
|               base64Urls.map((base64Url) => ({ |               base64Urls.map((base64Url) => ({ | ||||||
|                 type: 'imageFile', |                 type: 'imageFile', | ||||||
| @@ -99,13 +106,9 @@ export function useChatExtraContext(): ChatExtraContextApi { | |||||||
|                 base64Url, |                 base64Url, | ||||||
|               })) |               })) | ||||||
|             ); |             ); | ||||||
|               }) |           } else { | ||||||
|             : convertPDFToText(file).then((content) => { |             // Convert PDF to text | ||||||
|                 if (isSupportVision) { |             const content = await convertPDFToText(file); | ||||||
|                   toast.success( |  | ||||||
|                     'PDF file converted to text. You can also convert it to image, see in Settings.' |  | ||||||
|                   ); |  | ||||||
|                 } |  | ||||||
|             addItems([ |             addItems([ | ||||||
|               { |               { | ||||||
|                 type: 'textFile', |                 type: 'textFile', | ||||||
| @@ -113,12 +116,12 @@ export function useChatExtraContext(): ChatExtraContextApi { | |||||||
|                 content, |                 content, | ||||||
|               }, |               }, | ||||||
|             ]); |             ]); | ||||||
|               }); |             if (isSupportVision) { | ||||||
|  |               toast.success( | ||||||
|         promise.catch((error) => { |                 'PDF file converted to text. You can also convert it to image, see in Settings.' | ||||||
|           console.error(error); |               ); | ||||||
|           toast.error('Failed to parse PDF file.'); |             } | ||||||
|         }); |           } | ||||||
|           break; |           break; | ||||||
|         } else { |         } else { | ||||||
|           // Because there can be many text file types (like code file), we will not check the mime type |           // Because there can be many text file types (like code file), we will not check the mime type | ||||||
| @@ -143,6 +146,11 @@ export function useChatExtraContext(): ChatExtraContextApi { | |||||||
|           reader.readAsText(file); |           reader.readAsText(file); | ||||||
|         } |         } | ||||||
|       } |       } | ||||||
|  |     } catch (error) { | ||||||
|  |       const message = error instanceof Error ? error.message : String(error); | ||||||
|  |       const errorMessage = `Error processing file: ${message}`; | ||||||
|  |       toast.error(errorMessage); | ||||||
|  |     } | ||||||
|   }; |   }; | ||||||
|  |  | ||||||
|   return { |   return { | ||||||
| @@ -154,6 +162,25 @@ export function useChatExtraContext(): ChatExtraContextApi { | |||||||
|   }; |   }; | ||||||
| } | } | ||||||
|  |  | ||||||
|  | async function getFileAsBase64(file: File, outputUrl = true): Promise<string> { | ||||||
|  |   return new Promise((resolve, reject) => { | ||||||
|  |     const reader = new FileReader(); | ||||||
|  |     reader.onload = (event) => { | ||||||
|  |       if (event.target?.result) { | ||||||
|  |         let result = event.target.result as string; | ||||||
|  |         if (!outputUrl) { | ||||||
|  |           // remove base64 url prefix and correct characters | ||||||
|  |           result = result.substring(result.indexOf(',') + 1); | ||||||
|  |         } | ||||||
|  |         resolve(result); | ||||||
|  |       } else { | ||||||
|  |         reject(new Error('Failed to read file.')); | ||||||
|  |       } | ||||||
|  |     }; | ||||||
|  |     reader.readAsDataURL(file); | ||||||
|  |   }); | ||||||
|  | } | ||||||
|  |  | ||||||
| async function getFileAsBuffer(file: File): Promise<ArrayBuffer> { | async function getFileAsBuffer(file: File): Promise<ArrayBuffer> { | ||||||
|   return new Promise((resolve, reject) => { |   return new Promise((resolve, reject) => { | ||||||
|     const reader = new FileReader(); |     const reader = new FileReader(); | ||||||
|   | |||||||
| @@ -89,6 +89,14 @@ export function normalizeMsgsForAPI(messages: Readonly<Message[]>) { | |||||||
|           type: 'image_url', |           type: 'image_url', | ||||||
|           image_url: { url: extra.base64Url }, |           image_url: { url: extra.base64Url }, | ||||||
|         }); |         }); | ||||||
|  |       } else if (extra.type === 'audioFile') { | ||||||
|  |         contentArr.push({ | ||||||
|  |           type: 'input_audio', | ||||||
|  |           input_audio: { | ||||||
|  |             data: extra.base64Data, | ||||||
|  |             format: /wav/.test(extra.mimeType) ? 'wav' : 'mp3', | ||||||
|  |           }, | ||||||
|  |         }); | ||||||
|       } else { |       } else { | ||||||
|         throw new Error('Unknown extra type'); |         throw new Error('Unknown extra type'); | ||||||
|       } |       } | ||||||
|   | |||||||
| @@ -51,6 +51,7 @@ export interface Message { | |||||||
| export type MessageExtra = | export type MessageExtra = | ||||||
|   | MessageExtraTextFile |   | MessageExtraTextFile | ||||||
|   | MessageExtraImageFile |   | MessageExtraImageFile | ||||||
|  |   | MessageExtraAudioFile | ||||||
|   | MessageExtraContext; |   | MessageExtraContext; | ||||||
|  |  | ||||||
| export interface MessageExtraTextFile { | export interface MessageExtraTextFile { | ||||||
| @@ -65,6 +66,13 @@ export interface MessageExtraImageFile { | |||||||
|   base64Url: string; |   base64Url: string; | ||||||
| } | } | ||||||
|  |  | ||||||
|  | export interface MessageExtraAudioFile { | ||||||
|  |   type: 'audioFile'; | ||||||
|  |   name: string; | ||||||
|  |   base64Data: string; | ||||||
|  |   mimeType: string; | ||||||
|  | } | ||||||
|  |  | ||||||
| export interface MessageExtraContext { | export interface MessageExtraContext { | ||||||
|   type: 'context'; |   type: 'context'; | ||||||
|   name: string; |   name: string; | ||||||
| @@ -79,6 +87,10 @@ export type APIMessageContentPart = | |||||||
|   | { |   | { | ||||||
|       type: 'image_url'; |       type: 'image_url'; | ||||||
|       image_url: { url: string }; |       image_url: { url: string }; | ||||||
|  |     } | ||||||
|  |   | { | ||||||
|  |       type: 'input_audio'; | ||||||
|  |       input_audio: { data: string; format: 'wav' | 'mp3' }; | ||||||
|     }; |     }; | ||||||
|  |  | ||||||
| export type APIMessage = { | export type APIMessage = { | ||||||
| @@ -120,6 +132,7 @@ export interface LlamaCppServerProps { | |||||||
|   n_ctx: number; |   n_ctx: number; | ||||||
|   modalities?: { |   modalities?: { | ||||||
|     vision: boolean; |     vision: boolean; | ||||||
|  |     audio: boolean; | ||||||
|   }; |   }; | ||||||
|   // TODO: support params |   // TODO: support params | ||||||
| } | } | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Xuan-Son Nguyen
					Xuan-Son Nguyen