automatically detect the file encoding when importing a delimiter-separated values (DSV) file

This commit is contained in:
MaysWind
2025-12-03 23:56:13 +08:00
parent 81226c3bb2
commit e143c8f098
23 changed files with 215 additions and 37 deletions
+34 -1
View File
@@ -9,8 +9,10 @@ export const SUPPORTED_DOCUMENT_LANGUAGES_FOR_IMPORT_FILE: Record<string, string
'zh-Hant': 'zh-Hans',
};
export const UTF_8 = 'utf-8';
export const SUPPORTED_FILE_ENCODINGS: string[] = [
'utf-8', // UTF-8
UTF_8, // UTF-8
'utf-8-bom', // UTF-8 with BOM
'utf-16le', // UTF-16 Little Endian
'utf-16be', // UTF-16 Big Endian
@@ -64,6 +66,37 @@ export const SUPPORTED_FILE_ENCODINGS: string[] = [
'shift_jis', // Japanese (Shift_JIS)
];
/**
 * Maps the encoding names reported by the `chardet` library (keys) to the
 * encoding identifiers used for file import (values).
 *
 * Names that chardet can report but that have no entry here are treated as
 * unsupported by whoever performs the lookup.
 */
export const CHARDET_ENCODING_NAME_MAPPING: Record<string, string> = {
    // Unicode
    'UTF-8': UTF_8,
    'UTF-16LE': 'utf-16le',
    'UTF-16BE': 'utf-16be',
    // not supported: 'UTF-32 LE', 'UTF-32 BE'

    // ISO-2022 family and East Asian multi-byte encodings
    'ISO-2022-JP': 'iso-2022-jp',
    // not supported: 'ISO-2022-KR', 'ISO-2022-CN'
    'Shift_JIS': 'shift_jis',
    'Big5': 'big5',
    'EUC-JP': 'euc-jp',
    'EUC-KR': 'euc-kr',
    'GB18030': 'gb18030',

    // ISO-8859 single-byte encodings
    'ISO-8859-1': 'iso-8859-1',
    'ISO-8859-2': 'iso-8859-2',
    'ISO-8859-5': 'iso-8859-5',
    'ISO-8859-6': 'iso-8859-6',
    'ISO-8859-7': 'iso-8859-7',
    'ISO-8859-8': 'iso-8859-8',
    'ISO-8859-9': 'iso-8859-9',

    // Windows code pages
    'windows-1250': 'windows-1250',
    'windows-1251': 'windows-1251',
    'windows-1252': 'windows-1252',
    'windows-1253': 'windows-1253',
    'windows-1254': 'windows-1254',
    'windows-1255': 'windows-1255',
    'windows-1256': 'windows-1256',

    // Cyrillic
    'KOI8-R': 'koi8r'
};
export const SUPPORTED_IMPORT_FILE_CATEGORY_AND_TYPES: ImportFileCategoryAndTypes[] = [
{
categoryName: 'ezBookkeeping File Format',
+58
View File
@@ -1,5 +1,9 @@
import chardet, { type Match } from 'chardet';
import type { ImportFileTypeAndExtensions } from '@/core/file.ts';
import { UTF_8, CHARDET_ENCODING_NAME_MAPPING } from '@/consts/file.ts';
import { isString } from './common.ts';
export function getFileExtension(filename: string): string {
@@ -41,3 +45,57 @@ export function isFileExtensionSupported(filename: string, supportedExtensions:
return false;
}
// Lowest chardet confidence at which the top-ranked match is trusted.
const MIN_ENCODING_DETECTION_CONFIDENCE = 50;

// Identifier of the "UTF-8 with BOM" entry in the supported file encodings list.
const UTF_8_WITH_BOM_ENCODING = 'utf-8-bom';

/**
 * Tells whether the content starts with the UTF-8 byte order mark (EF BB BF).
 */
function startsWithUtf8ByteOrderMark(bytes: Uint8Array): boolean {
    return bytes.length >= 3 && bytes[0] === 0xEF && bytes[1] === 0xBB && bytes[2] === 0xBF;
}

/**
 * Determines the import encoding identifier for raw file content.
 *
 * @param bytes - the complete file content
 * @returns the encoding identifier mapped from the top-ranked chardet match
 * @throws Error when nothing could be detected, or when the detected encoding
 *         has no entry in `CHARDET_ENCODING_NAME_MAPPING`
 */
function detectEncodingOfFileContent(bytes: Uint8Array): string {
    const candidates: Match[] = chardet.analyse(bytes);
    // only the first match is considered; it is treated as the most likely one
    const bestMatch = candidates?.[0];

    if (!bestMatch) {
        throw new Error('unable to detect file encoding');
    }

    if (!bestMatch.name || bestMatch.confidence < MIN_ENCODING_DETECTION_CONFIDENCE) {
        // A low-confidence result is still safe to read as UTF-8 when every byte is ASCII.
        const hasNonAsciiByte = bytes.some(byte => byte > 0x7F);

        if (!hasNonAsciiByte) {
            return UTF_8;
        }

        throw new Error('unable to detect file encoding');
    }

    const encoding = CHARDET_ENCODING_NAME_MAPPING[bestMatch.name];

    if (!encoding) {
        throw new Error(`unsupported file encoding: ${bestMatch.name}`);
    }

    // chardet reports plain "UTF-8" whether or not a byte order mark is present,
    // while the supported encodings list keeps "UTF-8 with BOM" as its own entry.
    // NOTE(review): assumes the consumer of this value treats 'utf-8-bom' as the
    // BOM-stripping variant of UTF-8 - confirm against the import backend.
    if (encoding === UTF_8 && startsWithUtf8ByteOrderMark(bytes)) {
        return UTF_8_WITH_BOM_ENCODING;
    }

    return encoding;
}

/**
 * Reads the whole file and detects its text encoding.
 *
 * @param file - the file selected for import
 * @returns a promise resolving to the detected encoding identifier; it rejects
 *          with an `Error` when the file cannot be read, the encoding cannot be
 *          detected, or the detected encoding is not supported
 */
export function detectFileEncoding(file: File): Promise<string> {
    return new Promise((resolve, reject) => {
        const reader = new FileReader();

        reader.onload = () => {
            // Anything thrown inside this event handler would otherwise escape the
            // promise and leave it pending forever, so every failure is turned into
            // a rejection.
            try {
                const arrayBuffer = reader.result as ArrayBuffer;
                resolve(detectEncodingOfFileContent(new Uint8Array(arrayBuffer)));
            } catch (error: unknown) {
                reject(error instanceof Error ? error : new Error('unable to detect file encoding'));
            }
        };

        reader.onerror = () => {
            reject(new Error('failed to read file for encoding detection'));
        };

        reader.readAsArrayBuffer(file);
    });
}
+2
View File
@@ -1443,6 +1443,7 @@
"No results": "Keine Ergebnisse",
"Unknown": "Unbekannt",
"Auto detect": "Auto detect",
"Detecting...": "Detecting...",
"Miscellaneous": "Verschiedenes",
"Default": "Standard",
"Included": "Included",
@@ -1896,6 +1897,7 @@
"Data File": "Datendatei",
"Data to import": "Data to import",
"Please select a file to import": "Bitte wählen Sie eine Datei zum Importieren aus",
"Unable to detect the file encoding automatically. Please select the actual encoding.": "Unable to detect the file encoding automatically. Please select the actual encoding.",
"Include Header Line": "Include Header Line",
"Time Format": "Time Format",
"Transaction Type Mapping": "Transaction Type Mapping",
+2
View File
@@ -1443,6 +1443,7 @@
"No results": "No results",
"Unknown": "Unknown",
"Auto detect": "Auto detect",
"Detecting...": "Detecting...",
"Miscellaneous": "Miscellaneous",
"Default": "Default",
"Included": "Included",
@@ -1896,6 +1897,7 @@
"Data File": "Data File",
"Data to import": "Data to import",
"Please select a file to import": "Please select a file to import",
"Unable to detect the file encoding automatically. Please select the actual encoding.": "Unable to detect the file encoding automatically. Please select the actual encoding.",
"Include Header Line": "Include Header Line",
"Time Format": "Time Format",
"Transaction Type Mapping": "Transaction Type Mapping",
+2
View File
@@ -1443,6 +1443,7 @@
"No results": "Sin resultados",
"Unknown": "Desconocido",
"Auto detect": "Auto detect",
"Detecting...": "Detecting...",
"Miscellaneous": "Misceláneas",
"Default": "Por defecto",
"Included": "Included",
@@ -1896,6 +1897,7 @@
"Data File": "Archivo de datos",
"Data to import": "Data to import",
"Please select a file to import": "Please select a file to import",
"Unable to detect the file encoding automatically. Please select the actual encoding.": "Unable to detect the file encoding automatically. Please select the actual encoding.",
"Include Header Line": "Include Header Line",
"Time Format": "Time Format",
"Transaction Type Mapping": "Transaction Type Mapping",
+2
View File
@@ -1443,6 +1443,7 @@
"No results": "Aucun résultat",
"Unknown": "Inconnu",
"Auto detect": "Détection automatique",
"Detecting...": "Detecting...",
"Miscellaneous": "Divers",
"Default": "Par défaut",
"Included": "Included",
@@ -1896,6 +1897,7 @@
"Data File": "Fichier de données",
"Data to import": "Données à importer",
"Please select a file to import": "Veuillez sélectionner un fichier à importer",
"Unable to detect the file encoding automatically. Please select the actual encoding.": "Unable to detect the file encoding automatically. Please select the actual encoding.",
"Include Header Line": "Inclure la ligne d'en-tête",
"Time Format": "Format d'heure",
"Transaction Type Mapping": "Mappage du type de transaction",
+5
View File
@@ -2140,6 +2140,10 @@ export function useI18n() {
return ret;
}
/**
 * Returns the translated display name of a file encoding.
 *
 * @param encoding - encoding identifier; it is appended to the `encoding.` prefix to form the translation key
 * @returns whatever `t` yields for that translation key
 */
function getLocalizedFileEncodingName(encoding: string): string {
    const translationKey = `encoding.${encoding}`;
    return t(translationKey);
}
function getLocalizedOAuth2ProviderName(oauth2Provider: string, oidcDisplayNames: Record<string, string>): string {
if (oauth2Provider === 'oidc') {
const providerDisplayName = getServerMultiLanguageConfigContent(oidcDisplayNames);
@@ -2452,6 +2456,7 @@ export function useI18n() {
getAmountPrependAndAppendText,
getCategorizedAccountsWithDisplayBalance,
// other format functions
getLocalizedFileEncodingName,
getLocalizedOAuth2ProviderName,
getLocalizedOAuth2LoginText,
// localization setting functions
+2
View File
@@ -1443,6 +1443,7 @@
"No results": "Nessun risultato",
"Unknown": "Sconosciuto",
"Auto detect": "Rilevamento automatico",
"Detecting...": "Detecting...",
"Miscellaneous": "Varie",
"Default": "Predefinito",
"Included": "Included",
@@ -1896,6 +1897,7 @@
"Data File": "File dati",
"Data to import": "Dati da importare",
"Please select a file to import": "Seleziona un file da importare",
"Unable to detect the file encoding automatically. Please select the actual encoding.": "Unable to detect the file encoding automatically. Please select the actual encoding.",
"Include Header Line": "Includi riga di intestazione",
"Time Format": "Formato ora",
"Transaction Type Mapping": "Mappatura tipo transazione",
+2
View File
@@ -1443,6 +1443,7 @@
"No results": "結果はありません",
"Unknown": "不明",
"Auto detect": "自動検出",
"Detecting...": "Detecting...",
"Miscellaneous": "その他",
"Default": "デフォルト",
"Included": "Included",
@@ -1896,6 +1897,7 @@
"Data File": "データファイル",
"Data to import": "インポートするデータ",
"Please select a file to import": "インポートするファイルを選択してください",
"Unable to detect the file encoding automatically. Please select the actual encoding.": "Unable to detect the file encoding automatically. Please select the actual encoding.",
"Include Header Line": "ヘッダー行を含める",
"Time Format": "時刻形式",
"Transaction Type Mapping": "取引タイプのマッピング",
+2
View File
@@ -1443,6 +1443,7 @@
"No results": "결과 없음",
"Unknown": "알 수 없음",
"Auto detect": "자동 감지",
"Detecting...": "Detecting...",
"Miscellaneous": "기타",
"Default": "기본값",
"Included": "Included",
@@ -1896,6 +1897,7 @@
"Data File": "데이터 파일",
"Data to import": "가져올 데이터",
"Please select a file to import": "가져올 파일을 선택하십시오",
"Unable to detect the file encoding automatically. Please select the actual encoding.": "Unable to detect the file encoding automatically. Please select the actual encoding.",
"Include Header Line": "헤더 행 포함",
"Time Format": "시간 형식",
"Transaction Type Mapping": "거래 유형 매핑",
+2
View File
@@ -1443,6 +1443,7 @@
"No results": "Geen resultaten",
"Unknown": "Onbekend",
"Auto detect": "Automatisch detecteren",
"Detecting...": "Detecting...",
"Miscellaneous": "Diversen",
"Default": "Standaard",
"Included": "Included",
@@ -1896,6 +1897,7 @@
"Data File": "Gegevensbestand",
"Data to import": "Te importeren gegevens",
"Please select a file to import": "Selecteer een bestand om te importeren",
"Unable to detect the file encoding automatically. Please select the actual encoding.": "Unable to detect the file encoding automatically. Please select the actual encoding.",
"Include Header Line": "Kopregel opnemen",
"Time Format": "Tijdsformaat",
"Transaction Type Mapping": "Transactietypetoewijzing",
+2
View File
@@ -1443,6 +1443,7 @@
"No results": "Sem resultados",
"Unknown": "Desconhecido",
"Auto detect": "Detecção automática",
"Detecting...": "Detecting...",
"Miscellaneous": "Diversos",
"Default": "Padrão",
"Included": "Included",
@@ -1896,6 +1897,7 @@
"Data File": "Arquivo de Dados",
"Data to import": "Dados para importar",
"Please select a file to import": "Por favor, selecione um arquivo para importar",
"Unable to detect the file encoding automatically. Please select the actual encoding.": "Unable to detect the file encoding automatically. Please select the actual encoding.",
"Include Header Line": "Incluir Linha de Cabeçalho",
"Time Format": "Formato de Tempo",
"Transaction Type Mapping": "Mapeamento de Tipo de Transação",
+2
View File
@@ -1443,6 +1443,7 @@
"No results": "Нет результатов",
"Unknown": "Неизвестно",
"Auto detect": "Auto detect",
"Detecting...": "Detecting...",
"Miscellaneous": "Разное",
"Default": "По умолчанию",
"Included": "Included",
@@ -1896,6 +1897,7 @@
"Data File": "Файл данных",
"Data to import": "Data to import",
"Please select a file to import": "Please select a file to import",
"Unable to detect the file encoding automatically. Please select the actual encoding.": "Unable to detect the file encoding automatically. Please select the actual encoding.",
"Include Header Line": "Include Header Line",
"Time Format": "Time Format",
"Transaction Type Mapping": "Transaction Type Mapping",
+2
View File
@@ -1443,6 +1443,7 @@
"No results": "ไม่มีผลลัพธ์",
"Unknown": "ไม่ทราบ",
"Auto detect": "ตรวจสอบอัตโนมัติ",
"Detecting...": "Detecting...",
"Miscellaneous": "อื่น ๆ",
"Default": "ค่าเริ่มต้น",
"Included": "Included",
@@ -1896,6 +1897,7 @@
"Data File": "ไฟล์ข้อมูล",
"Data to import": "ข้อมูลที่จะนำเข้า",
"Please select a file to import": "กรุณาเลือกไฟล์เพื่อนำเข้า",
"Unable to detect the file encoding automatically. Please select the actual encoding.": "Unable to detect the file encoding automatically. Please select the actual encoding.",
"Include Header Line": "รวมแถวหัวตาราง",
"Time Format": "รูปแบบเวลา",
"Transaction Type Mapping": "การแมปประเภทรายการ",
+2
View File
@@ -1443,6 +1443,7 @@
"No results": "Немає результатів",
"Unknown": "Невідомо",
"Auto detect": "Автовизначення",
"Detecting...": "Detecting...",
"Miscellaneous": "Різне",
"Default": "По замовчуванню",
"Included": "Included",
@@ -1896,6 +1897,7 @@
"Data File": "Файл даних",
"Data to import": "Дані для імпорту",
"Please select a file to import": "Будь ласка, виберіть файл для імпорту",
"Unable to detect the file encoding automatically. Please select the actual encoding.": "Unable to detect the file encoding automatically. Please select the actual encoding.",
"Include Header Line": "Включити рядок заголовка",
"Time Format": "Формат часу",
"Transaction Type Mapping": "Відповідність типів транзакцій",
+2
View File
@@ -1443,6 +1443,7 @@
"No results": "Không có kết quả",
"Unknown": "Không rõ",
"Auto detect": "Auto detect",
"Detecting...": "Detecting...",
"Miscellaneous": "Linh tinh",
"Default": "Mặc định",
"Included": "Included",
@@ -1896,6 +1897,7 @@
"Data File": "Tệp dữ liệu",
"Data to import": "Data to import",
"Please select a file to import": "Please select a file to import",
"Unable to detect the file encoding automatically. Please select the actual encoding.": "Unable to detect the file encoding automatically. Please select the actual encoding.",
"Include Header Line": "Include Header Line",
"Time Format": "Time Format",
"Transaction Type Mapping": "Transaction Type Mapping",
+2
View File
@@ -1443,6 +1443,7 @@
"No results": "无结果",
"Unknown": "未知",
"Auto detect": "自动检测",
"Detecting...": "正在检测...",
"Miscellaneous": "杂项",
"Default": "默认",
"Included": "包含",
@@ -1896,6 +1897,7 @@
"Data File": "数据文件",
"Data to import": "要导入的数据",
"Please select a file to import": "请选择要导入的文件",
"Unable to detect the file encoding automatically. Please select the actual encoding.": "无法自动检测文件编码。请选择实际的编码。",
"Include Header Line": "包含标题行",
"Time Format": "时间格式",
"Amount Format": "金额格式",
+2
View File
@@ -1443,6 +1443,7 @@
"No results": "無結果",
"Unknown": "未知",
"Auto detect": "自動偵測",
"Detecting...": "正在偵測...",
"Miscellaneous": "雜項",
"Default": "預設",
"Included": "包含",
@@ -1896,6 +1897,7 @@
"Data File": "資料檔案",
"Data to import": "要匯入的資料",
"Please select a file to import": "請選擇要匯入的檔案",
"Unable to detect the file encoding automatically. Please select the actual encoding.": "無法自動偵測檔案編碼。請選擇實際的編碼。",
"Include Header Line": "包含標頭列",
"Time Format": "時間格式",
"Amount Format": "金額格式",
@@ -237,7 +237,7 @@
:prepend-icon="mdiClose" @click="close(false)"
v-if="currentStep !== 'finalResult'">{{ tt('Cancel') }}</v-btn>
<v-btn class="button-icon-with-direction" color="primary"
:disabled="loading || submitting || (!isImportDataFromTextbox && !importFile) || (isImportDataFromTextbox && !importData)"
:disabled="loading || submitting || (!isImportDataFromTextbox && !importFile) || (isImportDataFromTextbox && !importData) || (!isImportDataFromTextbox && allSupportedEncodings && fileEncoding === 'auto' && !autoDetectedFileEncoding)"
:append-icon="!submitting ? mdiArrowRight : undefined" @click="parseData"
v-if="currentStep === 'defineColumn' || currentStep === 'executeCustomScript' || currentStep === 'uploadFile'">
{{ tt('Next') }}
@@ -293,10 +293,12 @@ import {
type LocalizedImportFileTypeSupportedEncodings,
KnownFileType
} from '@/core/file.ts';
import { UTF_8 } from '@/consts/file.ts';
import { ImportTransaction } from '@/models/imported_transaction.ts';
import { isDefined, isNumber } from '@/lib/common.ts';
import { findExtensionByType, isFileExtensionSupported } from '@/lib/file.ts';
import { findExtensionByType, isFileExtensionSupported, detectFileEncoding } from '@/lib/file.ts';
import { generateRandomUUID } from '@/lib/misc.ts';
import logger from '@/lib/logger.ts';
@@ -330,7 +332,8 @@ const {
joinMultiText,
getCurrentNumeralSystemType,
getAllSupportedImportFileCagtegoryAndTypes,
formatNumberToLocalizedNumerals
formatNumberToLocalizedNumerals,
getLocalizedFileEncodingName
} = useI18n();
const accountsStore = useAccountsStore();
@@ -377,7 +380,9 @@ const currentStep = ref<ImportTransactionDialogStep>('uploadFile');
const importProcess = ref<number>(0);
const fileType = ref<string>('ezbookkeeping');
const fileSubType = ref<string>('ezbookkeeping_csv');
const fileEncoding = ref<string>('utf-8');
const fileEncoding = ref<string>('auto');
const detectingFileEncoding = ref<boolean>(false);
const autoDetectedFileEncoding = ref<string | undefined>(undefined);
const processDSVMethod = ref<ImportDSVProcessMethod>(ImportDSVProcessMethod.ColumnMapping);
const importFile = ref<File | null>(null);
const importData = ref<string>('');
@@ -396,7 +401,39 @@ const numeralSystem = computed<NumeralSystem>(() => getCurrentNumeralSystemType(
const allSupportedImportFileCategoryAndTypes = computed<LocalizedImportFileCategoryAndTypes[]>(() => getAllSupportedImportFileCagtegoryAndTypes());
const allFileSubTypes = computed<LocalizedImportFileTypeSubType[] | undefined>(() => allSupportedImportFileTypesMap.value[fileType.value]?.subTypes);
const allSupportedEncodings = computed<LocalizedImportFileTypeSupportedEncodings[] | undefined>(() => allSupportedImportFileTypesMap.value[fileType.value]?.supportedEncodings);
// Encoding options for the currently selected file type, or undefined when that
// file type declares no supported encodings. The list always starts with an
// "Auto detect" entry; once a file has been chosen, its label also carries the
// detection state (in progress, the detected encoding, or unknown).
const allSupportedEncodings = computed<LocalizedImportFileTypeSupportedEncodings[] | undefined>(() => {
    const encodingsOfFileType = allSupportedImportFileTypesMap.value[fileType.value]?.supportedEncodings;

    if (!encodingsOfFileType) {
        return undefined;
    }

    const autoDetectLabel = tt('Auto detect');
    let detectionStateSuffix = '';

    if (importFile.value) {
        if (detectingFileEncoding.value) {
            detectionStateSuffix = ` [${tt('Detecting...')}]`;
        } else if (autoDetectedFileEncoding.value) {
            detectionStateSuffix = ` [${getLocalizedFileEncodingName(autoDetectedFileEncoding.value)}]`;
        } else {
            detectionStateSuffix = ` [${tt('Unknown')}]`;
        }
    }

    return [
        {
            displayName: autoDetectLabel + detectionStateSuffix,
            encoding: 'auto'
        },
        ...encodingsOfFileType
    ];
});
const isImportDataFromTextbox = computed<boolean>(() => allSupportedImportFileTypesMap.value[fileType.value]?.dataFromTextbox ?? false);
const supportedAdditionalOptions = computed<ImportFileTypeSupportedAdditionalOptions | undefined>(() => allSupportedImportFileTypesMap.value[fileType.value]?.supportedAdditionalOptions);
@@ -508,7 +545,9 @@ function getDisplayCount(count: number): string {
function open(): Promise<void> {
fileType.value = 'ezbookkeeping';
fileSubType.value = 'ezbookkeeping_csv';
fileEncoding.value = 'utf-8';
fileEncoding.value = 'auto';
detectingFileEncoding.value = false;
autoDetectedFileEncoding.value = undefined;
processDSVMethod.value = ImportDSVProcessMethod.ColumnMapping;
currentStep.value = 'uploadFile';
importProcess.value = 0;
@@ -570,7 +609,21 @@ function setImportFile(event: Event): void {
}
importFile.value = el.files[0] as File;
detectingFileEncoding.value = false;
autoDetectedFileEncoding.value = undefined;
el.value = '';
if (allSupportedEncodings.value) {
detectingFileEncoding.value = true;
detectFileEncoding(importFile.value).then(detectedEncoding => {
detectingFileEncoding.value = false;
autoDetectedFileEncoding.value = detectedEncoding;
}).catch(() => {
detectingFileEncoding.value = false;
autoDetectedFileEncoding.value = undefined;
});
}
}
function parseData(): void {
@@ -583,7 +636,11 @@ function parseData(): void {
}
if (allSupportedEncodings.value) {
encoding = fileEncoding.value;
if (fileEncoding.value === 'auto') {
encoding = autoDetectedFileEncoding.value;
} else {
encoding = fileEncoding.value;
}
}
if (!isImportDataFromTextbox.value) {
@@ -592,6 +649,13 @@ function parseData(): void {
return;
}
if (allSupportedEncodings.value) {
if (fileEncoding.value === 'auto' && !autoDetectedFileEncoding.value) {
snackbar.value?.showError('Unable to detect the file encoding automatically. Please select the actual encoding.');
return;
}
}
uploadFile = importFile.value;
} else if (isImportDataFromTextbox.value) {
if (!importData.value) {
@@ -608,7 +672,7 @@ function parseData(): void {
return;
}
encoding = 'utf-8';
encoding = UTF_8;
} else { // should not happen, but ts would check whether uploadFile has been assigned a value
snackbar.value?.showMessage('An error occurred');
return;