12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485 |
- package utils
- import (
- "bytes"
- "fmt"
- "github.com/saintfish/chardet"
- "golang.org/x/text/encoding"
- "golang.org/x/text/encoding/charmap"
- "golang.org/x/text/encoding/japanese"
- "golang.org/x/text/encoding/korean"
- "golang.org/x/text/encoding/simplifiedchinese"
- "golang.org/x/text/encoding/traditionalchinese"
- "golang.org/x/text/encoding/unicode"
- "golang.org/x/text/transform"
- "io"
- "strings"
- )
- // TransCharset 自动检测编码并转换为 UTF-8
- func TransCharset(s string) string {
- // 1. 自动检测编码
- detector := chardet.NewTextDetector()
- result, err := detector.DetectBest([]byte(s))
- if err != nil {
- fmt.Println("Encoding detection failed:", err)
- return s
- }
- // 2. 找到相应的编码
- fmt.Println("result.Charset:", result.Charset)
- enc := getEncoding(result.Charset)
- fmt.Println("enc:", enc)
- if enc == nil {
- // 直接返回原始字符串
- fmt.Println("Unsupported charset:", result.Charset)
- return s
- }
- // 3. 转换为 UTF-8
- rd := transform.NewReader(bytes.NewReader([]byte(s)), enc.NewDecoder())
- utf8Bytes, err := io.ReadAll(rd)
- if err != nil {
- fmt.Println("Encoding conversion failed:", err)
- return s
- }
- // 4. 返回转换后的 UTF-8 字符串
- return string(utf8Bytes)
- }
- // 根据字符集名称获取 `encoding.Encoding`
- func getEncoding(charset string) encoding.Encoding {
- switch charset {
- case "UTF-8", "ASCII":
- return encoding.Nop // 无需转换
- case "ISO-8859-1":
- return charmap.ISO8859_1
- case "ISO-8859-2":
- return charmap.ISO8859_2
- case "ISO-8859-15":
- return charmap.ISO8859_15
- case "Windows-1252":
- return charmap.Windows1252
- case "Big5":
- return traditionalchinese.Big5
- case "GB-2312", "GBK", "GB-18030":
- return simplifiedchinese.GBK
- case "Shift_JIS":
- return japanese.ShiftJIS
- case "EUC-JP":
- return japanese.EUCJP
- case "EUC-KR":
- return korean.EUCKR
- case "UTF-16LE":
- return unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM)
- case "UTF-16BE":
- return unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM)
- default:
- return nil
- }
- }
// Trim returns s with all leading and trailing spaces, carriage returns,
// line feeds, and tabs removed. Any other character, including other
// whitespace such as vertical tab or non-breaking space, is left in place.
func Trim(s string) string {
	isPadding := func(r rune) bool {
		switch r {
		case ' ', '\r', '\n', '\t':
			return true
		}
		return false
	}
	return strings.TrimFunc(s, isPadding)
}
|