mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-11-03 09:22:01 +00:00 
			
		
		
		
	server : support audio input (#13714)
* server : support audio input * add audio support on webui
This commit is contained in:
		@@ -1,4 +1,8 @@
 | 
			
		||||
import { DocumentTextIcon, XMarkIcon } from '@heroicons/react/24/outline';
 | 
			
		||||
import {
 | 
			
		||||
  DocumentTextIcon,
 | 
			
		||||
  SpeakerWaveIcon,
 | 
			
		||||
  XMarkIcon,
 | 
			
		||||
} from '@heroicons/react/24/outline';
 | 
			
		||||
import { MessageExtra } from '../utils/types';
 | 
			
		||||
import { useState } from 'react';
 | 
			
		||||
import { classNames } from '../utils/misc';
 | 
			
		||||
@@ -66,7 +70,11 @@ export default function ChatInputExtraContextItem({
 | 
			
		||||
                  className="w-14 h-14 flex items-center justify-center"
 | 
			
		||||
                  aria-description="Document icon"
 | 
			
		||||
                >
 | 
			
		||||
                  <DocumentTextIcon className="h-8 w-14 text-base-content/50" />
 | 
			
		||||
                  {item.type === 'audioFile' ? (
 | 
			
		||||
                    <SpeakerWaveIcon className="h-8 w-8 text-gray-500" />
 | 
			
		||||
                  ) : (
 | 
			
		||||
                    <DocumentTextIcon className="h-8 w-8 text-gray-500" />
 | 
			
		||||
                  )}
 | 
			
		||||
                </div>
 | 
			
		||||
 | 
			
		||||
                <div className="text-xs pr-4">
 | 
			
		||||
@@ -98,6 +106,19 @@ export default function ChatInputExtraContextItem({
 | 
			
		||||
                src={showingItem.base64Url}
 | 
			
		||||
                alt={`Preview image for ${showingItem.name}`}
 | 
			
		||||
              />
 | 
			
		||||
            ) : showingItem.type === 'audioFile' ? (
 | 
			
		||||
              <audio
 | 
			
		||||
                controls
 | 
			
		||||
                className="w-full"
 | 
			
		||||
                aria-description={`Audio file ${showingItem.name}`}
 | 
			
		||||
              >
 | 
			
		||||
                <source
 | 
			
		||||
                  src={`data:${showingItem.mimeType};base64,${showingItem.base64Data}`}
 | 
			
		||||
                  type={showingItem.mimeType}
 | 
			
		||||
                  aria-description={`Audio file ${showingItem.name}`}
 | 
			
		||||
                />
 | 
			
		||||
                Your browser does not support the audio element.
 | 
			
		||||
              </audio>
 | 
			
		||||
            ) : (
 | 
			
		||||
              <div className="overflow-x-auto">
 | 
			
		||||
                <pre className="whitespace-pre-wrap break-words text-sm">
 | 
			
		||||
 
 | 
			
		||||
@@ -278,6 +278,13 @@ export default function ChatScreen() {
 | 
			
		||||
 | 
			
		||||
function ServerInfo() {
 | 
			
		||||
  const { serverProps } = useAppContext();
 | 
			
		||||
  const modalities = [];
 | 
			
		||||
  if (serverProps?.modalities?.audio) {
 | 
			
		||||
    modalities.push('audio');
 | 
			
		||||
  }
 | 
			
		||||
  if (serverProps?.modalities?.vision) {
 | 
			
		||||
    modalities.push('vision');
 | 
			
		||||
  }
 | 
			
		||||
  return (
 | 
			
		||||
    <div
 | 
			
		||||
      className="card card-sm shadow-sm border-1 border-base-content/20 text-base-content/70 mb-6"
 | 
			
		||||
@@ -291,6 +298,13 @@ function ServerInfo() {
 | 
			
		||||
          <br />
 | 
			
		||||
          <b>Build</b>: {serverProps?.build_info}
 | 
			
		||||
          <br />
 | 
			
		||||
          {modalities.length > 0 ? (
 | 
			
		||||
            <>
 | 
			
		||||
              <b>Supported modalities:</b> {modalities.join(', ')}
 | 
			
		||||
            </>
 | 
			
		||||
          ) : (
 | 
			
		||||
            ''
 | 
			
		||||
          )}
 | 
			
		||||
        </p>
 | 
			
		||||
      </div>
 | 
			
		||||
    </div>
 | 
			
		||||
 
 | 
			
		||||
@@ -11,6 +11,7 @@ pdfjs.GlobalWorkerOptions.workerSrc = pdfjsWorkerSrc;
 | 
			
		||||
// This file handles uploading extra context items (a.k.a. files)
 | 
			
		||||
// It allows processing these kinds of files:
 | 
			
		||||
// - image files (converted to base64)
 | 
			
		||||
// - audio files (converted to base64)
 | 
			
		||||
// - text files (including code files)
 | 
			
		||||
// - pdf (converted to text)
 | 
			
		||||
 | 
			
		||||
@@ -41,96 +42,73 @@ export function useChatExtraContext(): ChatExtraContextApi {
 | 
			
		||||
 | 
			
		||||
  const isSupportVision = serverProps?.modalities?.vision;
 | 
			
		||||
 | 
			
		||||
  const onFileAdded = (files: File[]) => {
 | 
			
		||||
    for (const file of files) {
 | 
			
		||||
      const mimeType = file.type;
 | 
			
		||||
      console.debug({ mimeType, file });
 | 
			
		||||
      if (file.size > 10 * 1024 * 1024) {
 | 
			
		||||
        toast.error('File is too large. Maximum size is 10MB.');
 | 
			
		||||
        break;
 | 
			
		||||
      }
 | 
			
		||||
 | 
			
		||||
      if (mimeType.startsWith('image/')) {
 | 
			
		||||
        if (!isSupportVision) {
 | 
			
		||||
          toast.error('Multimodal is not supported by this server or model.');
 | 
			
		||||
  const onFileAdded = async (files: File[]) => {
 | 
			
		||||
    try {
 | 
			
		||||
      for (const file of files) {
 | 
			
		||||
        const mimeType = file.type;
 | 
			
		||||
        if (file.size > 10 * 1024 * 1024) {
 | 
			
		||||
          toast.error('File is too large. Maximum size is 10MB.');
 | 
			
		||||
          break;
 | 
			
		||||
        }
 | 
			
		||||
        const reader = new FileReader();
 | 
			
		||||
        reader.onload = async (event) => {
 | 
			
		||||
          if (event.target?.result) {
 | 
			
		||||
            let base64Url = event.target.result as string;
 | 
			
		||||
 | 
			
		||||
            if (mimeType === 'image/svg+xml') {
 | 
			
		||||
              // Convert SVG to PNG
 | 
			
		||||
              base64Url = await svgBase64UrlToPngDataURL(base64Url);
 | 
			
		||||
            }
 | 
			
		||||
        if (mimeType.startsWith('image/')) {
 | 
			
		||||
          if (!isSupportVision) {
 | 
			
		||||
            toast.error('Multimodal is not supported by this server or model.');
 | 
			
		||||
            break;
 | 
			
		||||
          }
 | 
			
		||||
 | 
			
		||||
            addItems([
 | 
			
		||||
              {
 | 
			
		||||
          let base64Url = await getFileAsBase64(file);
 | 
			
		||||
          if (mimeType === 'image/svg+xml') {
 | 
			
		||||
            // Convert SVG to PNG
 | 
			
		||||
            base64Url = await svgBase64UrlToPngDataURL(base64Url);
 | 
			
		||||
          }
 | 
			
		||||
          addItems([
 | 
			
		||||
            {
 | 
			
		||||
              type: 'imageFile',
 | 
			
		||||
              name: file.name,
 | 
			
		||||
              base64Url,
 | 
			
		||||
            },
 | 
			
		||||
          ]);
 | 
			
		||||
        } else if (mimeType.startsWith('video/')) {
 | 
			
		||||
          toast.error('Video files are not supported yet.');
 | 
			
		||||
          break;
 | 
			
		||||
        } else if (mimeType.startsWith('audio/')) {
 | 
			
		||||
          if (!/mpeg|wav/.test(mimeType)) {
 | 
			
		||||
            toast.error('Only mp3 and wav audio files are supported.');
 | 
			
		||||
            break;
 | 
			
		||||
          }
 | 
			
		||||
 | 
			
		||||
          // plain base64, not a data URL
 | 
			
		||||
          const base64Data = await getFileAsBase64(file, false);
 | 
			
		||||
          addItems([
 | 
			
		||||
            {
 | 
			
		||||
              type: 'audioFile',
 | 
			
		||||
              name: file.name,
 | 
			
		||||
              mimeType,
 | 
			
		||||
              base64Data,
 | 
			
		||||
            },
 | 
			
		||||
          ]);
 | 
			
		||||
        } else if (mimeType.startsWith('application/pdf')) {
 | 
			
		||||
          if (config.pdfAsImage && !isSupportVision) {
 | 
			
		||||
            toast(
 | 
			
		||||
              'Multimodal is not supported, PDF will be converted to text instead of image.'
 | 
			
		||||
            );
 | 
			
		||||
            break;
 | 
			
		||||
          }
 | 
			
		||||
 | 
			
		||||
          if (config.pdfAsImage && isSupportVision) {
 | 
			
		||||
            // Convert PDF to images
 | 
			
		||||
            const base64Urls = await convertPDFToImage(file);
 | 
			
		||||
            addItems(
 | 
			
		||||
              base64Urls.map((base64Url) => ({
 | 
			
		||||
                type: 'imageFile',
 | 
			
		||||
                name: file.name,
 | 
			
		||||
                base64Url,
 | 
			
		||||
              },
 | 
			
		||||
            ]);
 | 
			
		||||
          }
 | 
			
		||||
        };
 | 
			
		||||
        reader.readAsDataURL(file);
 | 
			
		||||
      } else if (
 | 
			
		||||
        mimeType.startsWith('video/') ||
 | 
			
		||||
        mimeType.startsWith('audio/')
 | 
			
		||||
      ) {
 | 
			
		||||
        toast.error('Video and audio files are not supported yet.');
 | 
			
		||||
        break;
 | 
			
		||||
      } else if (mimeType.startsWith('application/pdf')) {
 | 
			
		||||
        if (config.pdfAsImage && !isSupportVision) {
 | 
			
		||||
          toast(
 | 
			
		||||
            'Multimodal is not supported, PDF will be converted to text instead of image.'
 | 
			
		||||
          );
 | 
			
		||||
          break;
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        const promise =
 | 
			
		||||
          config.pdfAsImage && isSupportVision
 | 
			
		||||
            ? convertPDFToImage(file).then((base64Urls) => {
 | 
			
		||||
                addItems(
 | 
			
		||||
                  base64Urls.map((base64Url) => ({
 | 
			
		||||
                    type: 'imageFile',
 | 
			
		||||
                    name: file.name,
 | 
			
		||||
                    base64Url,
 | 
			
		||||
                  }))
 | 
			
		||||
                );
 | 
			
		||||
              })
 | 
			
		||||
            : convertPDFToText(file).then((content) => {
 | 
			
		||||
                if (isSupportVision) {
 | 
			
		||||
                  toast.success(
 | 
			
		||||
                    'PDF file converted to text. You can also convert it to image, see in Settings.'
 | 
			
		||||
                  );
 | 
			
		||||
                }
 | 
			
		||||
                addItems([
 | 
			
		||||
                  {
 | 
			
		||||
                    type: 'textFile',
 | 
			
		||||
                    name: file.name,
 | 
			
		||||
                    content,
 | 
			
		||||
                  },
 | 
			
		||||
                ]);
 | 
			
		||||
              });
 | 
			
		||||
 | 
			
		||||
        promise.catch((error) => {
 | 
			
		||||
          console.error(error);
 | 
			
		||||
          toast.error('Failed to parse PDF file.');
 | 
			
		||||
        });
 | 
			
		||||
        break;
 | 
			
		||||
      } else {
 | 
			
		||||
        // Because there can be many text file types (like code file), we will not check the mime type
 | 
			
		||||
        // and will just check if the file is not binary.
 | 
			
		||||
        const reader = new FileReader();
 | 
			
		||||
        reader.onload = (event) => {
 | 
			
		||||
          if (event.target?.result) {
 | 
			
		||||
            const content = event.target.result as string;
 | 
			
		||||
            if (!isLikelyNotBinary(content)) {
 | 
			
		||||
              toast.error('File is binary. Please upload a text file.');
 | 
			
		||||
              return;
 | 
			
		||||
            }
 | 
			
		||||
              }))
 | 
			
		||||
            );
 | 
			
		||||
          } else {
 | 
			
		||||
            // Convert PDF to text
 | 
			
		||||
            const content = await convertPDFToText(file);
 | 
			
		||||
            addItems([
 | 
			
		||||
              {
 | 
			
		||||
                type: 'textFile',
 | 
			
		||||
@@ -138,10 +116,40 @@ export function useChatExtraContext(): ChatExtraContextApi {
 | 
			
		||||
                content,
 | 
			
		||||
              },
 | 
			
		||||
            ]);
 | 
			
		||||
            if (isSupportVision) {
 | 
			
		||||
              toast.success(
 | 
			
		||||
                'PDF file converted to text. You can also convert it to image, see in Settings.'
 | 
			
		||||
              );
 | 
			
		||||
            }
 | 
			
		||||
          }
 | 
			
		||||
        };
 | 
			
		||||
        reader.readAsText(file);
 | 
			
		||||
          break;
 | 
			
		||||
        } else {
 | 
			
		||||
          // Because there can be many text file types (like code file), we will not check the mime type
 | 
			
		||||
          // and will just check if the file is not binary.
 | 
			
		||||
          const reader = new FileReader();
 | 
			
		||||
          reader.onload = (event) => {
 | 
			
		||||
            if (event.target?.result) {
 | 
			
		||||
              const content = event.target.result as string;
 | 
			
		||||
              if (!isLikelyNotBinary(content)) {
 | 
			
		||||
                toast.error('File is binary. Please upload a text file.');
 | 
			
		||||
                return;
 | 
			
		||||
              }
 | 
			
		||||
              addItems([
 | 
			
		||||
                {
 | 
			
		||||
                  type: 'textFile',
 | 
			
		||||
                  name: file.name,
 | 
			
		||||
                  content,
 | 
			
		||||
                },
 | 
			
		||||
              ]);
 | 
			
		||||
            }
 | 
			
		||||
          };
 | 
			
		||||
          reader.readAsText(file);
 | 
			
		||||
        }
 | 
			
		||||
      }
 | 
			
		||||
    } catch (error) {
 | 
			
		||||
      const message = error instanceof Error ? error.message : String(error);
 | 
			
		||||
      const errorMessage = `Error processing file: ${message}`;
 | 
			
		||||
      toast.error(errorMessage);
 | 
			
		||||
    }
 | 
			
		||||
  };
 | 
			
		||||
 | 
			
		||||
@@ -154,6 +162,25 @@ export function useChatExtraContext(): ChatExtraContextApi {
 | 
			
		||||
  };
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/**
 * Reads a file through `FileReader.readAsDataURL` and resolves with its
 * base64-encoded contents.
 *
 * @param file - The file to read.
 * @param outputUrl - When `true` (the default) the complete data URL produced
 *   by the reader is returned. When `false`, everything up to and including
 *   the first comma is dropped so that only the base64 payload remains.
 * @returns The data URL, or the bare base64 payload when `outputUrl` is `false`.
 * @throws The returned promise rejects with an `Error` when the reader yields
 *   no string result, reports a read error, or is aborted.
 */
async function getFileAsBase64(file: File, outputUrl = true): Promise<string> {
  return new Promise((resolve, reject) => {
    const reader = new FileReader();

    reader.onload = () => {
      const dataUrl = reader.result;
      if (typeof dataUrl !== 'string' || dataUrl.length === 0) {
        reject(new Error('Failed to read file.'));
        return;
      }
      // Strip the "data:<mime>;base64," prefix when only the payload is wanted.
      resolve(
        outputUrl ? dataUrl : dataUrl.substring(dataUrl.indexOf(',') + 1)
      );
    };

    // Without these handlers a failed or aborted read would leave the promise
    // pending forever, so the caller could never report the problem.
    reader.onerror = () => {
      reject(reader.error ?? new Error('Failed to read file.'));
    };
    reader.onabort = () => {
      reject(new Error('File reading was aborted.'));
    };

    reader.readAsDataURL(file);
  });
}
 | 
			
		||||
 | 
			
		||||
async function getFileAsBuffer(file: File): Promise<ArrayBuffer> {
 | 
			
		||||
  return new Promise((resolve, reject) => {
 | 
			
		||||
    const reader = new FileReader();
 | 
			
		||||
 
 | 
			
		||||
@@ -89,6 +89,14 @@ export function normalizeMsgsForAPI(messages: Readonly<Message[]>) {
 | 
			
		||||
          type: 'image_url',
 | 
			
		||||
          image_url: { url: extra.base64Url },
 | 
			
		||||
        });
 | 
			
		||||
      } else if (extra.type === 'audioFile') {
 | 
			
		||||
        contentArr.push({
 | 
			
		||||
          type: 'input_audio',
 | 
			
		||||
          input_audio: {
 | 
			
		||||
            data: extra.base64Data,
 | 
			
		||||
            format: /wav/.test(extra.mimeType) ? 'wav' : 'mp3',
 | 
			
		||||
          },
 | 
			
		||||
        });
 | 
			
		||||
      } else {
 | 
			
		||||
        throw new Error('Unknown extra type');
 | 
			
		||||
      }
 | 
			
		||||
 
 | 
			
		||||
@@ -51,6 +51,7 @@ export interface Message {
 | 
			
		||||
/**
 * Union of the extra items that can accompany a message: text file, image
 * file, audio file, or context.
 * NOTE(review): members appear to be told apart by their `type` field
 * (e.g. 'audioFile', 'context') — confirm against the full declarations.
 */
export type MessageExtra =
 | 
			
		||||
  | MessageExtraTextFile
 | 
			
		||||
  | MessageExtraImageFile
 | 
			
		||||
  | MessageExtraAudioFile
 | 
			
		||||
  | MessageExtraContext;
 | 
			
		||||
 | 
			
		||||
export interface MessageExtraTextFile {
 | 
			
		||||
@@ -65,6 +66,13 @@ export interface MessageExtraImageFile {
 | 
			
		||||
  base64Url: string;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/**
 * An audio attachment carried as an extra item on a message.
 */
export interface MessageExtraAudioFile {
 | 
			
		||||
  /** Discriminant identifying this extra as an audio file. */
  type: 'audioFile';
 | 
			
		||||
  /** Name of the uploaded file. */
  name: string;
 | 
			
		||||
  /** Plain base64 payload of the audio, without a `data:` URL prefix. */
  base64Data: string;
 | 
			
		||||
  /** MIME type of the audio, e.g. one containing `wav` or `mpeg`. */
  mimeType: string;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
export interface MessageExtraContext {
 | 
			
		||||
  type: 'context';
 | 
			
		||||
  name: string;
 | 
			
		||||
@@ -79,6 +87,10 @@ export type APIMessageContentPart =
 | 
			
		||||
  | {
 | 
			
		||||
      type: 'image_url';
 | 
			
		||||
      image_url: { url: string };
 | 
			
		||||
    }
 | 
			
		||||
  | {
 | 
			
		||||
      type: 'input_audio';
 | 
			
		||||
      input_audio: { data: string; format: 'wav' | 'mp3' };
 | 
			
		||||
    };
 | 
			
		||||
 | 
			
		||||
export type APIMessage = {
 | 
			
		||||
@@ -120,6 +132,7 @@ export interface LlamaCppServerProps {
 | 
			
		||||
  n_ctx: number;
 | 
			
		||||
  modalities?: {
 | 
			
		||||
    vision: boolean;
 | 
			
		||||
    audio: boolean;
 | 
			
		||||
  };
 | 
			
		||||
  // TODO: support params
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user