merge UTF-8 and UTF-16 encodings with or without BOM, with BOM automatically detected and handled

This commit is contained in:
MaysWind
2026-03-01 15:27:03 +08:00
parent 8192a48bc5
commit 9d275a3051
21 changed files with 49 additions and 112 deletions
@@ -33,58 +33,55 @@ var supportedFileTypeSeparators = map[string]rune{
}
var supportedFileEncodings = map[string]encoding.Encoding{
"utf-8": unicode.UTF8, // UTF-8
"utf-8-bom": unicode.UTF8BOM, // UTF-8 with BOM
"utf-16le": unicode.UTF16(unicode.LittleEndian, unicode.UseBOM), // UTF-16 Little Endian
"utf-16be": unicode.UTF16(unicode.BigEndian, unicode.UseBOM), // UTF-16 Big Endian
"utf-16le-bom": unicode.UTF16(unicode.LittleEndian, unicode.ExpectBOM), // UTF-16 Little Endian with BOM
"utf-16be-bom": unicode.UTF16(unicode.BigEndian, unicode.ExpectBOM), // UTF-16 Big Endian with BOM
"cp437": charmap.CodePage437, // OEM United States (CP-437)
"cp863": charmap.CodePage863, // OEM Canadian French (CP-863)
"cp037": charmap.CodePage037, // IBM EBCDIC US/Canada (CP-037)
"cp1047": charmap.CodePage1047, // IBM EBCDIC Open Systems (CP-1047)
"cp1140": charmap.CodePage1140, // IBM EBCDIC US/Canada with Euro (CP-1140)
"iso-8859-1": charmap.ISO8859_1, // Western European (ISO-8859-1)
"cp850": charmap.CodePage850, // Western European (CP-850)
"cp858": charmap.CodePage858, // Western European with Euro (CP-858)
"windows-1252": charmap.Windows1252, // Western European (Windows-1252)
"iso-8859-15": charmap.ISO8859_15, // Western European (ISO-8859-15)
"iso-8859-4": charmap.ISO8859_4, // North European (ISO-8859-4)
"iso-8859-10": charmap.ISO8859_10, // North European (ISO-8859-10)
"cp865": charmap.CodePage865, // North European (CP-865)
"iso-8859-2": charmap.ISO8859_2, // Central European (ISO-8859-2)
"cp852": charmap.CodePage852, // Central European (CP-852)
"windows-1250": charmap.Windows1250, // Central European (Windows-1250)
"iso-8859-14": charmap.ISO8859_14, // Celtic (ISO-8859-14)
"iso-8859-3": charmap.ISO8859_3, // South European (ISO-8859-3)
"cp860": charmap.CodePage860, // Portuguese (CP-860)
"iso-8859-7": charmap.ISO8859_7, // Greek (ISO-8859-7)
"windows-1253": charmap.Windows1253, // Greek (Windows-1253)
"iso-8859-9": charmap.ISO8859_9, // Turkish (ISO-8859-9)
"windows-1254": charmap.Windows1254, // Turkish (Windows-1254)
"iso-8859-13": charmap.ISO8859_13, // Baltic (ISO-8859-13)
"windows-1257": charmap.Windows1257, // Baltic (Windows-1257)
"iso-8859-16": charmap.ISO8859_16, // South-Eastern European (ISO-8859-16)
"iso-8859-5": charmap.ISO8859_5, // Cyrillic (ISO-8859-5)
"cp855": charmap.CodePage855, // Cyrillic (CP-855)
"cp866": charmap.CodePage866, // Cyrillic (CP-866)
"windows-1251": charmap.Windows1251, // Cyrillic (Windows-1251)
"koi8r": charmap.KOI8R, // Cyrillic (KOI8-R)
"koi8u": charmap.KOI8U, // Cyrillic (KOI8-U)
"iso-8859-6": charmap.ISO8859_6, // Arabic (ISO-8859-6)
"windows-1256": charmap.Windows1256, // Arabic (Windows-1256)
"iso-8859-8": charmap.ISO8859_8, // Hebrew (ISO-8859-8)
"cp862": charmap.CodePage862, // Hebrew (CP-862)
"windows-1255": charmap.Windows1255, // Hebrew (Windows-1255)
"windows-874": charmap.Windows874, // Thai (Windows-874)
"windows-1258": charmap.Windows1258, // Vietnamese (Windows-1258)
"gb18030": simplifiedchinese.GB18030, // Chinese (Simplified, GB18030)
"gbk": simplifiedchinese.GBK, // Chinese (Simplified, GBK)
"big5": traditionalchinese.Big5, // Chinese (Traditional, Big5)
"euc-kr": korean.EUCKR, // Korean (EUC-KR)
"euc-jp": japanese.EUCJP, // Japanese (EUC-JP)
"iso-2022-jp": japanese.ISO2022JP, // Japanese (ISO-2022-JP)
"shift_jis": japanese.ShiftJIS, // Japanese (Shift JIS)
"utf-8": unicode.UTF8BOM, // UTF-8
"utf-16le": unicode.UTF16(unicode.LittleEndian, unicode.UseBOM), // UTF-16 Little Endian
"utf-16be": unicode.UTF16(unicode.BigEndian, unicode.UseBOM), // UTF-16 Big Endian
"cp437": charmap.CodePage437, // OEM United States (CP-437)
"cp863": charmap.CodePage863, // OEM Canadian French (CP-863)
"cp037": charmap.CodePage037, // IBM EBCDIC US/Canada (CP-037)
"cp1047": charmap.CodePage1047, // IBM EBCDIC Open Systems (CP-1047)
"cp1140": charmap.CodePage1140, // IBM EBCDIC US/Canada with Euro (CP-1140)
"iso-8859-1": charmap.ISO8859_1, // Western European (ISO-8859-1)
"cp850": charmap.CodePage850, // Western European (CP-850)
"cp858": charmap.CodePage858, // Western European with Euro (CP-858)
"windows-1252": charmap.Windows1252, // Western European (Windows-1252)
"iso-8859-15": charmap.ISO8859_15, // Western European (ISO-8859-15)
"iso-8859-4": charmap.ISO8859_4, // North European (ISO-8859-4)
"iso-8859-10": charmap.ISO8859_10, // North European (ISO-8859-10)
"cp865": charmap.CodePage865, // North European (CP-865)
"iso-8859-2": charmap.ISO8859_2, // Central European (ISO-8859-2)
"cp852": charmap.CodePage852, // Central European (CP-852)
"windows-1250": charmap.Windows1250, // Central European (Windows-1250)
"iso-8859-14": charmap.ISO8859_14, // Celtic (ISO-8859-14)
"iso-8859-3": charmap.ISO8859_3, // South European (ISO-8859-3)
"cp860": charmap.CodePage860, // Portuguese (CP-860)
"iso-8859-7": charmap.ISO8859_7, // Greek (ISO-8859-7)
"windows-1253": charmap.Windows1253, // Greek (Windows-1253)
"iso-8859-9": charmap.ISO8859_9, // Turkish (ISO-8859-9)
"windows-1254": charmap.Windows1254, // Turkish (Windows-1254)
"iso-8859-13": charmap.ISO8859_13, // Baltic (ISO-8859-13)
"windows-1257": charmap.Windows1257, // Baltic (Windows-1257)
"iso-8859-16": charmap.ISO8859_16, // South-Eastern European (ISO-8859-16)
"iso-8859-5": charmap.ISO8859_5, // Cyrillic (ISO-8859-5)
"cp855": charmap.CodePage855, // Cyrillic (CP-855)
"cp866": charmap.CodePage866, // Cyrillic (CP-866)
"windows-1251": charmap.Windows1251, // Cyrillic (Windows-1251)
"koi8r": charmap.KOI8R, // Cyrillic (KOI8-R)
"koi8u": charmap.KOI8U, // Cyrillic (KOI8-U)
"iso-8859-6": charmap.ISO8859_6, // Arabic (ISO-8859-6)
"windows-1256": charmap.Windows1256, // Arabic (Windows-1256)
"iso-8859-8": charmap.ISO8859_8, // Hebrew (ISO-8859-8)
"cp862": charmap.CodePage862, // Hebrew (CP-862)
"windows-1255": charmap.Windows1255, // Hebrew (Windows-1255)
"windows-874": charmap.Windows874, // Thai (Windows-874)
"windows-1258": charmap.Windows1258, // Vietnamese (Windows-1258)
"gb18030": simplifiedchinese.GB18030, // Chinese (Simplified, GB18030)
"gbk": simplifiedchinese.GBK, // Chinese (Simplified, GBK)
"big5": traditionalchinese.Big5, // Chinese (Traditional, Big5)
"euc-kr": korean.EUCKR, // Korean (EUC-KR)
"euc-jp": japanese.EUCJP, // Japanese (EUC-JP)
"iso-2022-jp": japanese.ISO2022JP, // Japanese (ISO-2022-JP)
"shift_jis": japanese.ShiftJIS, // Japanese (Shift JIS)
}
var customTransactionTypeNameMapping = map[models.TransactionType]string{