import { config } from '$lib/stores/settings.svelte';
import { slotsService } from './slots';

/**
 * ChatService - Low-level API communication layer for llama.cpp server interactions
 *
 * This service handles direct communication with the llama.cpp server's chat completion API.
 * It provides the network layer abstraction for AI model interactions while remaining
 * stateless and focused purely on API communication.
 *
 * **Architecture & Relationship with ChatStore:**
 * - **ChatService** (this class): Stateless API communication layer
 *   - Handles HTTP requests/responses with the llama.cpp server
 *   - Manages streaming and non-streaming response parsing
 *   - Provides request abortion capabilities
 *   - Converts database messages to API format
 *   - Handles error translation for server responses
 *
 * - **ChatStore**: Stateful orchestration and UI state management
 *   - Uses ChatService for all AI model communication
 *   - Manages conversation state, message history, and UI reactivity
 *   - Coordinates with DatabaseStore for persistence
 *   - Handles complex workflows like branching and regeneration
 *
 * **Key Responsibilities:**
 * - Message format conversion (DatabaseMessage → API format)
 * - Streaming response handling with real-time callbacks
 * - Reasoning content extraction and processing
 * - File attachment processing (images, PDFs, audio, text)
 * - Request lifecycle management (abort, cleanup)
 */
export class ChatService {
	private abortController: AbortController | null = null;

	/**
	 * Sends a chat completion request to the llama.cpp server.
	 * Supports both streaming and non-streaming responses with comprehensive parameter configuration.
	 * Automatically converts database messages with attachments to the appropriate API format.
	 *
	 * @param messages - Array of chat messages to send to the API (supports both ApiChatMessageData and DatabaseMessage with attachments)
	 * @param options - Configuration options for the chat completion request. See the `SettingsChatServiceOptions` type for details.
	 * @returns {Promise<string | void>} Resolves to the complete response string (non-streaming) or void (streaming)
	 * @throws {Error} if the request fails or is aborted
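	 *
	 * @example
	 * A non-streaming call resolves to the full completion text (sketch;
	 * assumes a prepared `messages` array in the caller):
	 * ```ts
	 * const text = await chatService.sendMessage(messages, {
	 *   stream: false,
	 *   temperature: 0.7
	 * });
	 * ```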
	 */
	async sendMessage(
		messages: ApiChatMessageData[] | (DatabaseMessage & { extra?: DatabaseMessageExtra[] })[],
		options: SettingsChatServiceOptions = {}
	): Promise<string | void> {
		const {
			stream,
			onChunk,
			onComplete,
			onError,
			// Generation parameters
			temperature,
			max_tokens,
			// Sampling parameters
			dynatemp_range,
			dynatemp_exponent,
			top_k,
			top_p,
			min_p,
			xtc_probability,
			xtc_threshold,
			typ_p,
			// Penalty parameters
			repeat_last_n,
			repeat_penalty,
			presence_penalty,
			frequency_penalty,
			dry_multiplier,
			dry_base,
			dry_allowed_length,
			dry_penalty_last_n,
			// Other parameters
			samplers,
			custom,
			timings_per_token
		} = options;

		const currentConfig = config();

		// Cancel any ongoing request and create a new abort controller
		this.abort();
		this.abortController = new AbortController();

		// Convert database messages with attachments to API format if needed
		const normalizedMessages: ApiChatMessageData[] = messages
			.map((msg) => {
				// Detect DatabaseMessage objects by their database-specific fields
				if ('id' in msg && 'convId' in msg && 'timestamp' in msg) {
					// This is a DatabaseMessage, convert it
					const dbMsg = msg as DatabaseMessage & { extra?: DatabaseMessageExtra[] };
					return ChatService.convertMessageToChatServiceData(dbMsg);
				} else {
					// This is already an ApiChatMessageData object
					return msg as ApiChatMessageData;
				}
			})
			.filter((msg) => {
				// Filter out empty system messages
				if (msg.role === 'system') {
					const content = typeof msg.content === 'string' ? msg.content : '';

					return content.trim().length > 0;
				}

				return true;
			});

		// Build base request body with system message injection
		const processedMessages = this.injectSystemMessage(normalizedMessages);

		const requestBody: ApiChatCompletionRequest = {
			messages: processedMessages.map((msg: ApiChatMessageData) => ({
				role: msg.role,
				content: msg.content
			})),
			stream
		};

		requestBody.reasoning_format = currentConfig.disableReasoningFormat ? 'none' : 'auto';

		if (temperature !== undefined) requestBody.temperature = temperature;
		if (max_tokens !== undefined) {
			// Map an explicit 0 or null to -1 (infinite); otherwise pass the value through
			requestBody.max_tokens = max_tokens !== null && max_tokens !== 0 ? max_tokens : -1;
		}

		if (dynatemp_range !== undefined) requestBody.dynatemp_range = dynatemp_range;
		if (dynatemp_exponent !== undefined) requestBody.dynatemp_exponent = dynatemp_exponent;
		if (top_k !== undefined) requestBody.top_k = top_k;
		if (top_p !== undefined) requestBody.top_p = top_p;
		if (min_p !== undefined) requestBody.min_p = min_p;
		if (xtc_probability !== undefined) requestBody.xtc_probability = xtc_probability;
		if (xtc_threshold !== undefined) requestBody.xtc_threshold = xtc_threshold;
		if (typ_p !== undefined) requestBody.typ_p = typ_p;

		if (repeat_last_n !== undefined) requestBody.repeat_last_n = repeat_last_n;
		if (repeat_penalty !== undefined) requestBody.repeat_penalty = repeat_penalty;
		if (presence_penalty !== undefined) requestBody.presence_penalty = presence_penalty;
		if (frequency_penalty !== undefined) requestBody.frequency_penalty = frequency_penalty;
		if (dry_multiplier !== undefined) requestBody.dry_multiplier = dry_multiplier;
		if (dry_base !== undefined) requestBody.dry_base = dry_base;
		if (dry_allowed_length !== undefined) requestBody.dry_allowed_length = dry_allowed_length;
		if (dry_penalty_last_n !== undefined) requestBody.dry_penalty_last_n = dry_penalty_last_n;

		if (samplers !== undefined) {
			requestBody.samplers =
				typeof samplers === 'string'
					? samplers.split(';').filter((s: string) => s.trim())
					: samplers;
		}
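		// e.g. a settings string 'top_k;top_p;min_p' (illustrative value) becomes
		// ['top_k', 'top_p', 'min_p'] above; empty segments are dropped by the filter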

		if (timings_per_token !== undefined) requestBody.timings_per_token = timings_per_token;

		if (custom) {
			try {
				const customParams = typeof custom === 'string' ? JSON.parse(custom) : custom;
				Object.assign(requestBody, customParams);
			} catch (error) {
				console.warn('Failed to parse custom parameters:', error);
			}
		}

		try {
			const apiKey = currentConfig.apiKey?.toString().trim();

			const response = await fetch(`./v1/chat/completions`, {
				method: 'POST',
				headers: {
					'Content-Type': 'application/json',
					...(apiKey ? { Authorization: `Bearer ${apiKey}` } : {})
				},
				body: JSON.stringify(requestBody),
				signal: this.abortController.signal
			});

			if (!response.ok) {
				// Surface structured server errors via parseErrorResponse
				const error = await this.parseErrorResponse(response);
				if (onError) {
					onError(error);
				}
				throw error;
			}

			if (stream) {
				return this.handleStreamResponse(
					response,
					onChunk,
					onComplete,
					onError,
					options.onReasoningChunk
				);
			} else {
				return this.handleNonStreamResponse(response, onComplete, onError);
			}
		} catch (error) {
			if (error instanceof Error && error.name === 'AbortError') {
				console.log('Chat completion request was aborted');
				return;
			}

			let userFriendlyError: Error;

			if (error instanceof Error) {
				if (error.name === 'TypeError' && error.message.includes('fetch')) {
					userFriendlyError = new Error(
						'Unable to connect to server - please check if the server is running'
					);
					userFriendlyError.name = 'NetworkError';
				} else if (error.message.includes('ECONNREFUSED')) {
					userFriendlyError = new Error('Connection refused - server may be offline');
					userFriendlyError.name = 'NetworkError';
				} else if (error.message.includes('ETIMEDOUT')) {
					userFriendlyError = new Error('Request timed out - the server took too long to respond');
					userFriendlyError.name = 'TimeoutError';
				} else {
					userFriendlyError = error;
				}
			} else {
				userFriendlyError = new Error('Unknown error occurred while sending message');
			}

			console.error('Error in sendMessage:', error);
			if (onError) {
				onError(userFriendlyError);
			}
			throw userFriendlyError;
		}
	}

	/**
	 * Handles streaming responses from the chat completion API.
	 * Processes server-sent events and extracts content chunks from the stream.
	 *
	 * @param response - The fetch Response object containing the streaming data
	 * @param onChunk - Optional callback invoked for each content chunk received
	 * @param onComplete - Optional callback invoked when the stream completes, with the full response
	 * @param onError - Optional callback invoked if an error occurs during streaming
	 * @param onReasoningChunk - Optional callback invoked for each reasoning content chunk
	 * @returns {Promise<void>} Promise that resolves when streaming is complete
	 * @throws {Error} if the stream cannot be read or parsed
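	 *
	 * @example
	 * The loop below consumes `data:` lines from a server-sent event stream;
	 * an abbreviated, illustrative exchange looks like:
	 * ```text
	 * data: {"choices":[{"delta":{"content":"Hel"}}]}
	 * data: {"choices":[{"delta":{"content":"lo"}}],"timings":{"predicted_n":2,"predicted_ms":40}}
	 * data: [DONE]
	 * ```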
	 */
	private async handleStreamResponse(
		response: Response,
		onChunk?: (chunk: string) => void,
		onComplete?: (
			response: string,
			reasoningContent?: string,
			timings?: ChatMessageTimings
		) => void,
		onError?: (error: Error) => void,
		onReasoningChunk?: (chunk: string) => void
	): Promise<void> {
		const reader = response.body?.getReader();

		if (!reader) {
			throw new Error('No response body');
		}

		const decoder = new TextDecoder();
		let aggregatedContent = '';
		let fullReasoningContent = '';
		let hasReceivedData = false;
		let lastTimings: ChatMessageTimings | undefined;
		let streamFinished = false;

		try {
			let chunk = '';
			while (true) {
				const { done, value } = await reader.read();
				if (done) break;

				chunk += decoder.decode(value, { stream: true });
				const lines = chunk.split('\n');
				chunk = lines.pop() || ''; // Save the incomplete line for the next read

				for (const line of lines) {
					if (line.startsWith('data: ')) {
						const data = line.slice(6);
						if (data === '[DONE]') {
							streamFinished = true;
							continue;
						}

						try {
							const parsed: ApiChatCompletionStreamChunk = JSON.parse(data);

							const content = parsed.choices[0]?.delta?.content;
							const reasoningContent = parsed.choices[0]?.delta?.reasoning_content;
							const timings = parsed.timings;
							const promptProgress = parsed.prompt_progress;

							if (timings || promptProgress) {
								this.updateProcessingState(timings, promptProgress);

								// Store the latest timing data
								if (timings) {
									lastTimings = timings;
								}
							}

							if (content) {
								hasReceivedData = true;
								aggregatedContent += content;
								onChunk?.(content);
							}

							if (reasoningContent) {
								hasReceivedData = true;
								fullReasoningContent += reasoningContent;
								onReasoningChunk?.(reasoningContent);
							}
						} catch (e) {
							console.error('Error parsing JSON chunk:', e);
						}
					}
				}
			}

			if (streamFinished) {
				if (!hasReceivedData && aggregatedContent.length === 0) {
					const noResponseError = new Error('No response received from server. Please try again.');
					throw noResponseError;
				}

				onComplete?.(aggregatedContent, fullReasoningContent || undefined, lastTimings);
			}
		} catch (error) {
			const err = error instanceof Error ? error : new Error('Stream error');

			onError?.(err);

			throw err;
		} finally {
			reader.releaseLock();
		}
	}

	/**
	 * Handles non-streaming responses from the chat completion API.
	 * Parses the JSON response and extracts the generated content.
	 *
	 * @param response - The fetch Response object containing the JSON data
	 * @param onComplete - Optional callback invoked when the response is successfully parsed
	 * @param onError - Optional callback invoked if an error occurs during parsing
	 * @returns {Promise<string>} Promise that resolves to the generated content string
	 * @throws {Error} if the response cannot be parsed or is malformed
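	 *
	 * @example
	 * An abbreviated, illustrative response body (only the fields read below are shown):
	 * ```json
	 * { "choices": [{ "message": { "content": "Hello!", "reasoning_content": "..." } }] }
	 * ```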
	 */
	private async handleNonStreamResponse(
		response: Response,
		onComplete?: (
			response: string,
			reasoningContent?: string,
			timings?: ChatMessageTimings
		) => void,
		onError?: (error: Error) => void
	): Promise<string> {
		try {
			const responseText = await response.text();

			if (!responseText.trim()) {
				const noResponseError = new Error('No response received from server. Please try again.');
				throw noResponseError;
			}

			const data: ApiChatCompletionResponse = JSON.parse(responseText);
			const content = data.choices[0]?.message?.content || '';
			const reasoningContent = data.choices[0]?.message?.reasoning_content;

			if (reasoningContent) {
				console.log('Full reasoning content:', reasoningContent);
			}

			if (!content.trim()) {
				const noResponseError = new Error('No response received from server. Please try again.');
				throw noResponseError;
			}

			onComplete?.(content, reasoningContent);

			return content;
		} catch (error) {
			const err = error instanceof Error ? error : new Error('Parse error');

			onError?.(err);

			throw err;
		}
	}

	/**
	 * Converts a database message with attachments to the API chat message format.
	 * Processes various attachment types (images, text files, PDFs, audio) and formats them
	 * as content parts suitable for the chat completion API.
	 *
	 * @param message - Database message object with optional extra attachments
	 * @param message.content - The text content of the message
	 * @param message.role - The role of the message sender (user, assistant, system)
	 * @param message.extra - Optional array of message attachments (images, files, etc.)
	 * @returns {ApiChatMessageData} Message object formatted for the chat completion API
	 * @static
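	 *
	 * @example
	 * A user message with one image attachment becomes a content-part array
	 * (sketch; field values are illustrative):
	 * ```ts
	 * const apiMsg = ChatService.convertMessageToChatServiceData({
	 *   ...dbMessage, // an existing DatabaseMessage (assumed)
	 *   content: 'What is in this picture?',
	 *   extra: [{ type: 'imageFile', name: 'cat.png', base64Url: 'data:image/png;base64,...' }]
	 * });
	 * // apiMsg.content → [
	 * //   { type: 'text', text: 'What is in this picture?' },
	 * //   { type: 'image_url', image_url: { url: 'data:image/png;base64,...' } }
	 * // ]
	 * ```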
	 */
	static convertMessageToChatServiceData(
		message: DatabaseMessage & { extra?: DatabaseMessageExtra[] }
	): ApiChatMessageData {
		if (!message.extra || message.extra.length === 0) {
			return {
				role: message.role as 'user' | 'assistant' | 'system',
				content: message.content
			};
		}

		const contentParts: ApiChatMessageContentPart[] = [];

		if (message.content) {
			contentParts.push({
				type: 'text',
				text: message.content
			});
		}

		const imageFiles = message.extra.filter(
			(extra: DatabaseMessageExtra): extra is DatabaseMessageExtraImageFile =>
				extra.type === 'imageFile'
		);

		for (const image of imageFiles) {
			contentParts.push({
				type: 'image_url',
				image_url: { url: image.base64Url }
			});
		}

		const textFiles = message.extra.filter(
			(extra: DatabaseMessageExtra): extra is DatabaseMessageExtraTextFile =>
				extra.type === 'textFile'
		);

		for (const textFile of textFiles) {
			contentParts.push({
				type: 'text',
				text: `\n\n--- File: ${textFile.name} ---\n${textFile.content}`
			});
		}

		const audioFiles = message.extra.filter(
			(extra: DatabaseMessageExtra): extra is DatabaseMessageExtraAudioFile =>
				extra.type === 'audioFile'
		);

		for (const audio of audioFiles) {
			contentParts.push({
				type: 'input_audio',
				input_audio: {
					data: audio.base64Data,
					format: audio.mimeType.includes('wav') ? 'wav' : 'mp3'
				}
			});
		}

		const pdfFiles = message.extra.filter(
			(extra: DatabaseMessageExtra): extra is DatabaseMessageExtraPdfFile =>
				extra.type === 'pdfFile'
		);

		for (const pdfFile of pdfFiles) {
			if (pdfFile.processedAsImages && pdfFile.images) {
				for (let i = 0; i < pdfFile.images.length; i++) {
					contentParts.push({
						type: 'image_url',
						image_url: { url: pdfFile.images[i] }
					});
				}
			} else {
				contentParts.push({
					type: 'text',
					text: `\n\n--- PDF File: ${pdfFile.name} ---\n${pdfFile.content}`
				});
			}
		}

		return {
			role: message.role as 'user' | 'assistant' | 'system',
			content: contentParts
		};
	}

	/**
	 * Fetches server properties from the llama.cpp `/props` endpoint (static, for API compatibility).
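	 *
	 * @example
	 * Usage sketch:
	 * ```ts
	 * const props = await ChatService.getServerProps();
	 * console.log(props); // server properties as reported by `/props`
	 * ```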
	 */
	static async getServerProps(): Promise<ApiLlamaCppServerProps> {
		try {
			const currentConfig = config();
			const apiKey = currentConfig.apiKey?.toString().trim();

			const response = await fetch(`./props`, {
				headers: {
					'Content-Type': 'application/json',
					...(apiKey ? { Authorization: `Bearer ${apiKey}` } : {})
				}
			});

			if (!response.ok) {
				throw new Error(`Failed to fetch server props: ${response.status}`);
			}

			const data = await response.json();
			return data;
		} catch (error) {
			console.error('Error fetching server props:', error);
			throw error;
		}
	}

	/**
	 * Aborts any ongoing chat completion request.
	 * Cancels the current request and cleans up the abort controller.
	 *
	 * @public
	 */
	public abort(): void {
		if (this.abortController) {
			this.abortController.abort();
			this.abortController = null;
		}
	}

	/**
	 * Injects the configured system message at the beginning of the conversation, if one is set.
	 * If the conversation already starts with a system message, it is replaced when its content
	 * differs from the configured one, avoiding duplication.
	 *
	 * @param messages - Array of chat messages to process
	 * @returns Array of messages with the configured system message at the beginning
	 * @private
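	 *
	 * @example
	 * With a configured system message of 'Be concise.' (illustrative value):
	 * ```ts
	 * // input:  [{ role: 'user', content: 'Hi' }]
	 * // output: [{ role: 'system', content: 'Be concise.' }, { role: 'user', content: 'Hi' }]
	 * ```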
	 */
	private injectSystemMessage(messages: ApiChatMessageData[]): ApiChatMessageData[] {
		const currentConfig = config();
		const systemMessage = currentConfig.systemMessage?.toString().trim();

		if (!systemMessage) {
			return messages;
		}

		if (messages.length > 0 && messages[0].role === 'system') {
			if (messages[0].content !== systemMessage) {
				const updatedMessages = [...messages];

				updatedMessages[0] = {
					role: 'system',
					content: systemMessage
				};

				return updatedMessages;
			}

			return messages;
		}

		const systemMsg: ApiChatMessageData = {
			role: 'system',
			content: systemMessage
		};

		return [systemMsg, ...messages];
	}

	/**
	 * Parses an error response and creates an appropriate error with context information.
	 *
	 * @param response - HTTP response object
	 * @returns {Promise<Error>} Parsed error with context information if available
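	 *
	 * @example
	 * An illustrative error body this method can parse (only `error.message`
	 * is read here; the other fields and values are assumptions for context):
	 * ```json
	 * { "error": { "code": 400, "message": "...", "type": "server_error" } }
	 * ```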
	 */
	private async parseErrorResponse(response: Response): Promise<Error> {
		try {
			const errorText = await response.text();
			const errorData: ApiErrorResponse = JSON.parse(errorText);

			const message = errorData.error?.message || 'Unknown server error';
			const error = new Error(message);
			error.name = response.status === 400 ? 'ServerError' : 'HttpError';

			return error;
		} catch {
			// If we can't parse the error response, return a generic error
			const fallback = new Error(`Server error (${response.status}): ${response.statusText}`);
			fallback.name = 'HttpError';

			return fallback;
		}
	}

	private updateProcessingState(
		timings?: ChatMessageTimings,
		promptProgress?: ChatMessagePromptProgress
	): void {
		// Calculate tokens per second from timing data
		const tokensPerSecond =
			timings?.predicted_ms && timings?.predicted_n
				? (timings.predicted_n / timings.predicted_ms) * 1000
				: 0;
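		// e.g. predicted_n = 256 tokens over predicted_ms = 2000 ms
		// → (256 / 2000) * 1000 = 128 tokens/s (illustrative numbers)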

		// Update slots service with timing data (async but don't wait)
		slotsService
			.updateFromTimingData({
				prompt_n: timings?.prompt_n || 0,
				predicted_n: timings?.predicted_n || 0,
				predicted_per_second: tokensPerSecond,
				cache_n: timings?.cache_n || 0,
				prompt_progress: promptProgress
			})
			.catch((error) => {
				console.warn('Failed to update processing state:', error);
			});
	}
}

export const chatService = new ChatService();