mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-10-28 08:31:25 +00:00
Enhance text file detection logic for file attachments (#16199)
* feat: Enhances text file detection logic * chore: Build static `webui` output * chore: update webui build output
This commit is contained in:
committed by
GitHub
parent
1a18927894
commit
807e8c6d31
Binary file not shown.
14
tools/server/webui/src/lib/constants/binary-detection.ts
Normal file
14
tools/server/webui/src/lib/constants/binary-detection.ts
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
/**
 * Tunable thresholds for the heuristic that decides whether a file's decoded
 * content looks like binary data rather than text.
 */
export interface BinaryDetectionOptions {
	/** Number of characters to check from the beginning of the file */
	prefixLength: number;
	/** Maximum ratio of suspicious characters allowed (0.0 to 1.0) */
	suspiciousCharThresholdRatio: number;
	/** Maximum absolute number of null bytes allowed */
	maxAbsoluteNullBytes: number;
}

/**
 * Default thresholds, used for any option the caller does not override.
 */
export const DEFAULT_BINARY_DETECTION_OPTIONS: BinaryDetectionOptions = {
	// The length is measured in string characters, not bytes.
	prefixLength: 1024 * 10, // Check the first 10,240 characters of the string
	suspiciousCharThresholdRatio: 0.15, // Allow up to 15% suspicious chars
	maxAbsoluteNullBytes: 2
};
|
||||||
@@ -176,5 +176,13 @@ export const TEXT_FILE_TYPES = {
|
|||||||
[FileTypeText.SVELTE]: {
|
[FileTypeText.SVELTE]: {
|
||||||
extensions: [FileExtensionText.SVELTE],
|
extensions: [FileExtensionText.SVELTE],
|
||||||
mimeTypes: [MimeTypeText.SVELTE]
|
mimeTypes: [MimeTypeText.SVELTE]
|
||||||
|
},
|
||||||
|
[FileTypeText.LATEX]: {
|
||||||
|
extensions: [FileExtensionText.TEX],
|
||||||
|
mimeTypes: [MimeTypeText.LATEX]
|
||||||
|
},
|
||||||
|
[FileTypeText.BIBTEX]: {
|
||||||
|
extensions: [FileExtensionText.BIB],
|
||||||
|
mimeTypes: [MimeTypeText.BIBTEX]
|
||||||
}
|
}
|
||||||
} as const;
|
} as const;
|
||||||
|
|||||||
@@ -59,7 +59,9 @@ export enum FileTypeText {
|
|||||||
SWIFT = 'swift',
|
SWIFT = 'swift',
|
||||||
DART = 'dart',
|
DART = 'dart',
|
||||||
VUE = 'vue',
|
VUE = 'vue',
|
||||||
SVELTE = 'svelte'
|
SVELTE = 'svelte',
|
||||||
|
LATEX = 'latex',
|
||||||
|
BIBTEX = 'bibtex'
|
||||||
}
|
}
|
||||||
|
|
||||||
// File extension enums
|
// File extension enums
|
||||||
@@ -115,7 +117,9 @@ export enum FileExtensionText {
|
|||||||
SWIFT = '.swift',
|
SWIFT = '.swift',
|
||||||
DART = '.dart',
|
DART = '.dart',
|
||||||
VUE = '.vue',
|
VUE = '.vue',
|
||||||
SVELTE = '.svelte'
|
SVELTE = '.svelte',
|
||||||
|
TEX = '.tex',
|
||||||
|
BIB = '.bib'
|
||||||
}
|
}
|
||||||
|
|
||||||
// MIME type enums
|
// MIME type enums
|
||||||
@@ -174,5 +178,7 @@ export enum MimeTypeText {
|
|||||||
SWIFT = 'text/x-swift',
|
SWIFT = 'text/x-swift',
|
||||||
DART = 'text/x-dart',
|
DART = 'text/x-dart',
|
||||||
VUE = 'text/x-vue',
|
VUE = 'text/x-vue',
|
||||||
SVELTE = 'text/x-svelte'
|
SVELTE = 'text/x-svelte',
|
||||||
|
LATEX = 'text/x-tex',
|
||||||
|
BIBTEX = 'text/x-bibtex'
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -3,6 +3,10 @@
|
|||||||
* Handles text file detection, reading, and validation
|
* Handles text file detection, reading, and validation
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
import {
|
||||||
|
DEFAULT_BINARY_DETECTION_OPTIONS,
|
||||||
|
type BinaryDetectionOptions
|
||||||
|
} from '$lib/constants/binary-detection';
|
||||||
import { FileExtensionText } from '$lib/enums/files';
|
import { FileExtensionText } from '$lib/enums/files';
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -43,41 +47,51 @@ export async function readFileAsText(file: File): Promise<string> {
|
|||||||
* Heuristic check to determine if content is likely from a text file
|
* Heuristic check to determine if content is likely from a text file
|
||||||
* Detects binary files by counting suspicious characters and null bytes
|
* Detects binary files by counting suspicious characters and null bytes
|
||||||
* @param content - The file content to analyze
|
* @param content - The file content to analyze
|
||||||
|
* @param options - Optional configuration for detection parameters
|
||||||
* @returns True if the content appears to be text-based
|
* @returns True if the content appears to be text-based
|
||||||
*/
|
*/
|
||||||
export function isLikelyTextFile(
	content: string,
	options: Partial<BinaryDetectionOptions> = {}
): boolean {
	// Empty content carries no evidence of binary data, so treat it as text.
	if (!content) return true;

	// Resolve each setting with `??` instead of an object spread: a spread lets
	// an explicitly `undefined` override replace the default, which would
	// silently disable the corresponding check.
	const prefixLength = options.prefixLength ?? DEFAULT_BINARY_DETECTION_OPTIONS.prefixLength;
	const suspiciousCharThresholdRatio =
		options.suspiciousCharThresholdRatio ??
		DEFAULT_BINARY_DETECTION_OPTIONS.suspiciousCharThresholdRatio;
	const maxAbsoluteNullBytes =
		options.maxAbsoluteNullBytes ?? DEFAULT_BINARY_DETECTION_OPTIONS.maxAbsoluteNullBytes;

	const sample = content.substring(0, prefixLength);

	// A non-positive prefix length produces an empty sample. There is nothing
	// to inspect, and returning here avoids a 0 / 0 ratio further down.
	if (sample.length === 0) return true;

	let nullCount = 0;
	let suspiciousControlCount = 0;

	for (let i = 0; i < sample.length; i++) {
		const charCode = sample.charCodeAt(i);

		// Null bytes are strong indicators of binary files. They are limited by
		// an absolute count rather than a ratio, so stop as soon as the limit
		// is exceeded.
		if (charCode === 0) {
			nullCount++;
			if (nullCount > maxAbsoluteNullBytes) return false;
			continue;
		}

		// Only control codes 1-7 and 14-26 count as suspicious. Codes 8-13
		// (which include tab, newline and carriage return) and 27-31 (which
		// include ESC) are not counted.
		const isSuspiciousControl = charCode < 8 || (charCode > 13 && charCode < 27);

		// U+FFFD is the replacement character, which indicates encoding issues.
		const isReplacementChar = charCode === 0xfffd;

		if (isSuspiciousControl || isReplacementChar) {
			suspiciousControlCount++;
		}
	}

	// Reject if too many suspicious characters relative to the sample size
	if (suspiciousControlCount / sample.length > suspiciousCharThresholdRatio) return false;

	return true;
}
|
||||||
|
|||||||
Reference in New Issue
Block a user