mirror of
				https://github.com/ggml-org/llama.cpp.git
				synced 2025-10-30 08:42:00 +00:00 
			
		
		
		
	Enhance text file detection logic for file attachments (#16199)
* feat: Enhances text file detection logic
* chore: Build static `webui` output
* chore: update webui build output
This commit is contained in:
		 Aleksander Grygier
					Aleksander Grygier
				
			
				
					committed by
					
						 GitHub
						GitHub
					
				
			
			
				
	
			
			
			 GitHub
						GitHub
					
				
			
						parent
						
							1a18927894
						
					
				
				
					commit
					807e8c6d31
				
			
										
											Binary file not shown.
										
									
								
							
							
								
								
									
										14
									
								
								tools/server/webui/src/lib/constants/binary-detection.ts
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										14
									
								
								tools/server/webui/src/lib/constants/binary-detection.ts
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,14 @@ | ||||
/**
 * Tunable thresholds for the heuristic that decides whether a string read
 * from a file looks like binary data rather than text.
 */
export interface BinaryDetectionOptions {
	/** Number of characters to check from the beginning of the content */
	prefixLength: number;
	/** Maximum ratio of suspicious characters allowed (0.0 to 1.0) */
	suspiciousCharThresholdRatio: number;
	/** Maximum absolute number of null bytes allowed */
	maxAbsoluteNullBytes: number;
}

/** Default thresholds, used for every option the caller does not override. */
export const DEFAULT_BINARY_DETECTION_OPTIONS: BinaryDetectionOptions = {
	// 10 * 1024 = 10,240 characters of the string (a character count, not a byte count)
	prefixLength: 10 * 1024,
	// Allow up to 15% suspicious characters within the inspected prefix
	suspiciousCharThresholdRatio: 0.15,
	// More than two null characters marks the content as binary
	maxAbsoluteNullBytes: 2
};
| @@ -176,5 +176,13 @@ export const TEXT_FILE_TYPES = { | ||||
| 	[FileTypeText.SVELTE]: { | ||||
| 		extensions: [FileExtensionText.SVELTE], | ||||
| 		mimeTypes: [MimeTypeText.SVELTE] | ||||
| 	}, | ||||
| 	[FileTypeText.LATEX]: { | ||||
| 		extensions: [FileExtensionText.TEX], | ||||
| 		mimeTypes: [MimeTypeText.LATEX] | ||||
| 	}, | ||||
| 	[FileTypeText.BIBTEX]: { | ||||
| 		extensions: [FileExtensionText.BIB], | ||||
| 		mimeTypes: [MimeTypeText.BIBTEX] | ||||
| 	} | ||||
| } as const; | ||||
|   | ||||
| @@ -59,7 +59,9 @@ export enum FileTypeText { | ||||
| 	SWIFT = 'swift', | ||||
| 	DART = 'dart', | ||||
| 	VUE = 'vue', | ||||
| 	SVELTE = 'svelte' | ||||
| 	SVELTE = 'svelte', | ||||
| 	LATEX = 'latex', | ||||
| 	BIBTEX = 'bibtex' | ||||
| } | ||||
|  | ||||
| // File extension enums | ||||
| @@ -115,7 +117,9 @@ export enum FileExtensionText { | ||||
| 	SWIFT = '.swift', | ||||
| 	DART = '.dart', | ||||
| 	VUE = '.vue', | ||||
| 	SVELTE = '.svelte' | ||||
| 	SVELTE = '.svelte', | ||||
| 	TEX = '.tex', | ||||
| 	BIB = '.bib' | ||||
| } | ||||
|  | ||||
| // MIME type enums | ||||
| @@ -174,5 +178,7 @@ export enum MimeTypeText { | ||||
| 	SWIFT = 'text/x-swift', | ||||
| 	DART = 'text/x-dart', | ||||
| 	VUE = 'text/x-vue', | ||||
| 	SVELTE = 'text/x-svelte' | ||||
| 	SVELTE = 'text/x-svelte', | ||||
| 	LATEX = 'text/x-tex', | ||||
| 	BIBTEX = 'text/x-bibtex' | ||||
| } | ||||
|   | ||||
| @@ -3,6 +3,10 @@ | ||||
|  * Handles text file detection, reading, and validation | ||||
|  */ | ||||
|  | ||||
| import { | ||||
| 	DEFAULT_BINARY_DETECTION_OPTIONS, | ||||
| 	type BinaryDetectionOptions | ||||
| } from '$lib/constants/binary-detection'; | ||||
| import { FileExtensionText } from '$lib/enums/files'; | ||||
|  | ||||
| /** | ||||
| @@ -43,41 +47,51 @@ export async function readFileAsText(file: File): Promise<string> { | ||||
/**
 * Heuristic check to determine if content is likely from a text file.
 *
 * Inspects at most `prefixLength` leading characters of `content` and rejects it
 * when it holds more null characters than `maxAbsoluteNullBytes`, or when the
 * share of strongly suspicious characters (selected control codes and U+FFFD)
 * exceeds `suspiciousCharThresholdRatio`.
 *
 * @param content - The file content to analyze
 * @param options - Optional configuration for detection parameters; omitted or
 *   `undefined` fields fall back to `DEFAULT_BINARY_DETECTION_OPTIONS`
 * @returns True if the content appears to be text-based
 */
export function isLikelyTextFile(
	content: string,
	options: Partial<BinaryDetectionOptions> = {}
): boolean {
	if (!content) return true;

	// Resolve each field with `??` instead of an object spread: a spread copies an
	// explicitly passed `undefined` over the default, which would silently disable
	// the corresponding check (`x > undefined` is always false).
	const config: BinaryDetectionOptions = {
		prefixLength: options.prefixLength ?? DEFAULT_BINARY_DETECTION_OPTIONS.prefixLength,
		suspiciousCharThresholdRatio:
			options.suspiciousCharThresholdRatio ??
			DEFAULT_BINARY_DETECTION_OPTIONS.suspiciousCharThresholdRatio,
		maxAbsoluteNullBytes:
			options.maxAbsoluteNullBytes ?? DEFAULT_BINARY_DETECTION_OPTIONS.maxAbsoluteNullBytes
	};

	const sample = content.substring(0, config.prefixLength);

	// A non-positive prefixLength produces an empty sample: there is nothing to
	// inspect, and the ratio computed below would otherwise be 0 / 0.
	if (sample.length === 0) return true;

	let nullCount = 0;
	let suspiciousControlCount = 0;

	for (let i = 0; i < sample.length; i++) {
		const charCode = sample.charCodeAt(i);

		// Count null bytes - these are strong indicators of binary files
		if (charCode === 0) {
			nullCount++;

			continue;
		}

		// Count the most suspicious control characters: codes 1-7 and 14-26.
		// Tab (9), newline (10) and carriage return (13) are common whitespace, and
		// codes 8, 11, 12 and 27-31 are tolerated as well.
		if (charCode < 8 || (charCode > 13 && charCode < 27)) {
			suspiciousControlCount++;
		}

		// Count replacement characters (indicates encoding issues)
		if (charCode === 0xfffd) {
			suspiciousControlCount++;
		}
	}

	// Reject if too many null bytes
	if (nullCount > config.maxAbsoluteNullBytes) return false;

	// Reject if too many suspicious characters
	if (suspiciousControlCount / sample.length > config.suspiciousCharThresholdRatio) return false;

	return true;
}
|   | ||||
		Reference in New Issue
	
	Block a user