Enhance text file detection logic for file attachments (#16199)

* feat: Enhances text file detection logic

* chore: Build static `webui` output

* chore: update webui build output
This commit is contained in:
Aleksander Grygier
2025-09-26 19:25:29 +02:00
committed by GitHub
parent 1a18927894
commit 807e8c6d31
5 changed files with 56 additions and 14 deletions

Binary file not shown.

View File

@@ -0,0 +1,14 @@
/**
 * Tunable parameters for the binary-vs-text heuristic applied to file content.
 */
export interface BinaryDetectionOptions {
	/** How many characters, counted from the start of the file, are inspected */
	prefixLength: number;
	/** Highest tolerated share of suspicious characters, as a ratio from 0.0 to 1.0 */
	suspiciousCharThresholdRatio: number;
	/** Highest tolerated absolute count of null bytes */
	maxAbsoluteNullBytes: number;
}

/**
 * Baseline values for {@link BinaryDetectionOptions}.
 */
export const DEFAULT_BINARY_DETECTION_OPTIONS: BinaryDetectionOptions = {
	// Inspect the leading 10 * 1024 characters of the string
	prefixLength: 10 * 1024,
	// Tolerate up to 15% suspicious characters
	suspiciousCharThresholdRatio: 0.15,
	// Tolerate at most two null bytes
	maxAbsoluteNullBytes: 2
};

View File

@@ -176,5 +176,13 @@ export const TEXT_FILE_TYPES = {
[FileTypeText.SVELTE]: { [FileTypeText.SVELTE]: {
extensions: [FileExtensionText.SVELTE], extensions: [FileExtensionText.SVELTE],
mimeTypes: [MimeTypeText.SVELTE] mimeTypes: [MimeTypeText.SVELTE]
},
[FileTypeText.LATEX]: {
extensions: [FileExtensionText.TEX],
mimeTypes: [MimeTypeText.LATEX]
},
[FileTypeText.BIBTEX]: {
extensions: [FileExtensionText.BIB],
mimeTypes: [MimeTypeText.BIBTEX]
} }
} as const; } as const;

View File

@@ -59,7 +59,9 @@ export enum FileTypeText {
SWIFT = 'swift', SWIFT = 'swift',
DART = 'dart', DART = 'dart',
VUE = 'vue', VUE = 'vue',
SVELTE = 'svelte' SVELTE = 'svelte',
LATEX = 'latex',
BIBTEX = 'bibtex'
} }
// File extension enums // File extension enums
@@ -115,7 +117,9 @@ export enum FileExtensionText {
SWIFT = '.swift', SWIFT = '.swift',
DART = '.dart', DART = '.dart',
VUE = '.vue', VUE = '.vue',
SVELTE = '.svelte' SVELTE = '.svelte',
TEX = '.tex',
BIB = '.bib'
} }
// MIME type enums // MIME type enums
@@ -174,5 +178,7 @@ export enum MimeTypeText {
SWIFT = 'text/x-swift', SWIFT = 'text/x-swift',
DART = 'text/x-dart', DART = 'text/x-dart',
VUE = 'text/x-vue', VUE = 'text/x-vue',
SVELTE = 'text/x-svelte' SVELTE = 'text/x-svelte',
LATEX = 'text/x-tex',
BIBTEX = 'text/x-bibtex'
} }

View File

@@ -3,6 +3,10 @@
* Handles text file detection, reading, and validation * Handles text file detection, reading, and validation
*/ */
import {
DEFAULT_BINARY_DETECTION_OPTIONS,
type BinaryDetectionOptions
} from '$lib/constants/binary-detection';
import { FileExtensionText } from '$lib/enums/files'; import { FileExtensionText } from '$lib/enums/files';
/** /**
@@ -43,41 +47,51 @@ export async function readFileAsText(file: File): Promise<string> {
* Heuristic check to determine if content is likely from a text file * Heuristic check to determine if content is likely from a text file
* Detects binary files by counting suspicious characters and null bytes * Detects binary files by counting suspicious characters and null bytes
* @param content - The file content to analyze * @param content - The file content to analyze
* @param options - Optional configuration for detection parameters
* @returns True if the content appears to be text-based * @returns True if the content appears to be text-based
*/ */
/**
 * Heuristic check to determine if content is likely from a text file.
 *
 * Only the first `prefixLength` characters are sampled. Within that sample the
 * function counts null characters and "suspicious" characters (control codes
 * 1-7 and 14-26, plus the U+FFFD replacement character) and rejects the content
 * when either count exceeds its configured limit.
 *
 * @param content - The file content to analyze
 * @param options - Optional overrides for the detection parameters. Fields that
 *   are omitted or explicitly `undefined` fall back to
 *   `DEFAULT_BINARY_DETECTION_OPTIONS`.
 * @returns True if the content appears to be text-based
 */
export function isLikelyTextFile(
	content: string,
	options: Partial<BinaryDetectionOptions> = {}
): boolean {
	// Empty content is treated as text.
	if (!content) return true;

	// Resolve each field with `??` instead of object spread: a spread would let an
	// explicitly `undefined` override replace the default, which silently disables
	// the corresponding limit (any comparison against `undefined` is false).
	const config: BinaryDetectionOptions = {
		prefixLength: options.prefixLength ?? DEFAULT_BINARY_DETECTION_OPTIONS.prefixLength,
		suspiciousCharThresholdRatio:
			options.suspiciousCharThresholdRatio ??
			DEFAULT_BINARY_DETECTION_OPTIONS.suspiciousCharThresholdRatio,
		maxAbsoluteNullBytes:
			options.maxAbsoluteNullBytes ?? DEFAULT_BINARY_DETECTION_OPTIONS.maxAbsoluteNullBytes
	};

	const sample = content.substring(0, config.prefixLength);

	let nullCount = 0;
	let suspiciousControlCount = 0;

	for (let i = 0; i < sample.length; i++) {
		const charCode = sample.charCodeAt(i);

		// Null bytes are strong indicators of binary files; they are tallied
		// separately and do not contribute to the suspicious-character ratio.
		if (charCode === 0) {
			nullCount++;
			continue;
		}

		// Most suspicious control characters: codes 1-7 and 14-26. This range already
		// excludes tab (9), newline (10) and carriage return (13), and also leaves
		// codes 8, 11, 12 and 27-31 uncounted.
		const isSuspiciousControl = charCode < 8 || (charCode > 13 && charCode < 27);

		// The replacement character indicates encoding issues.
		const isReplacementChar = charCode === 0xfffd;

		if (isSuspiciousControl || isReplacementChar) {
			suspiciousControlCount++;
		}
	}

	// Reject if too many null bytes
	if (nullCount > config.maxAbsoluteNullBytes) return false;

	// Reject if too many suspicious characters. For an empty sample the ratio is
	// NaN, which never compares greater than the threshold, so nothing is rejected.
	if (suspiciousControlCount / sample.length > config.suspiciousCharThresholdRatio) return false;

	return true;
}