automatically detect file encoding when importing a delimiter-separated values (DSV) file

This commit is contained in:
MaysWind
2025-12-03 23:56:13 +08:00
parent 81226c3bb2
commit e143c8f098
23 changed files with 215 additions and 37 deletions
+58
View File
@@ -1,5 +1,9 @@
import chardet, { type Match } from 'chardet';
import type { ImportFileTypeAndExtensions } from '@/core/file.ts';
import { UTF_8, CHARDET_ENCODING_NAME_MAPPING } from '@/consts/file.ts';
import { isString } from './common.ts';
export function getFileExtension(filename: string): string {
@@ -41,3 +45,57 @@ export function isFileExtensionSupported(filename: string, supportedExtensions:
return false;
}
/**
 * Detects the text encoding of a file by reading its whole content and
 * passing the bytes to chardet.
 *
 * The highest-ranked chardet candidate is accepted only when it has a name and
 * a confidence of at least 50. Otherwise (including when chardet reports no
 * candidate at all) the content is accepted as UTF-8 if every byte is in the
 * ASCII range, because such content decodes identically in UTF-8.
 *
 * @param file - The file whose encoding should be detected.
 * @returns A promise resolved with the encoding name looked up in
 * `CHARDET_ENCODING_NAME_MAPPING`, or `UTF_8` for ASCII-only content.
 * @throws The promise is rejected with an `Error` when the file cannot be read,
 * when no encoding can be determined, or when the detected encoding has no
 * entry in `CHARDET_ENCODING_NAME_MAPPING`.
 */
export function detectFileEncoding(file: File): Promise<string> {
    return new Promise((resolve, reject) => {
        const reader = new FileReader();

        reader.onload = () => {
            // Errors thrown inside an event handler do not reach the promise,
            // so catch them here; otherwise the promise would stay pending forever.
            try {
                const result = reader.result;

                // readAsArrayBuffer should always produce an ArrayBuffer; narrow the
                // declared `string | ArrayBuffer | null` type instead of asserting it.
                if (result === null || typeof result === 'string') {
                    reject(new Error('failed to read file for encoding detection'));
                    return;
                }

                const bytes = new Uint8Array(result);
                const candidates: Match[] = chardet.analyse(bytes);
                // The original code treated the first entry as the most likely encoding.
                const best: Match | undefined = candidates?.[0];

                if (!best || !best.name || best.confidence < 50) {
                    // No trustworthy candidate: ASCII-only content is still safe to read as UTF-8.
                    if (bytes.every(byte => byte <= 0x7F)) {
                        resolve(UTF_8);
                        return;
                    }

                    reject(new Error('unable to detect file encoding'));
                    return;
                }

                const encoding = CHARDET_ENCODING_NAME_MAPPING[best.name];

                if (!encoding) {
                    reject(new Error(`unsupported file encoding: ${best.name}`));
                    return;
                }

                resolve(encoding);
            } catch (error) {
                reject(error instanceof Error ? error : new Error('unable to detect file encoding'));
            }
        };

        reader.onerror = () => {
            reject(new Error('failed to read file for encoding detection'));
        };

        reader.readAsArrayBuffer(file);
    });
}