charset.go 2.0 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485
  1. package utils
  2. import (
  3. "bytes"
  4. "fmt"
  5. "github.com/saintfish/chardet"
  6. "golang.org/x/text/encoding"
  7. "golang.org/x/text/encoding/charmap"
  8. "golang.org/x/text/encoding/japanese"
  9. "golang.org/x/text/encoding/korean"
  10. "golang.org/x/text/encoding/simplifiedchinese"
  11. "golang.org/x/text/encoding/traditionalchinese"
  12. "golang.org/x/text/encoding/unicode"
  13. "golang.org/x/text/transform"
  14. "io"
  15. "strings"
  16. )
  17. // TransCharset 自动检测编码并转换为 UTF-8
  18. func TransCharset(s string) string {
  19. // 1. 自动检测编码
  20. detector := chardet.NewTextDetector()
  21. result, err := detector.DetectBest([]byte(s))
  22. if err != nil {
  23. fmt.Println("Encoding detection failed:", err)
  24. return s
  25. }
  26. // 2. 找到相应的编码
  27. fmt.Println("result.Charset:", result.Charset)
  28. enc := getEncoding(result.Charset)
  29. fmt.Println("enc:", enc)
  30. if enc == nil {
  31. // 直接返回原始字符串
  32. fmt.Println("Unsupported charset:", result.Charset)
  33. return s
  34. }
  35. // 3. 转换为 UTF-8
  36. rd := transform.NewReader(bytes.NewReader([]byte(s)), enc.NewDecoder())
  37. utf8Bytes, err := io.ReadAll(rd)
  38. if err != nil {
  39. fmt.Println("Encoding conversion failed:", err)
  40. return s
  41. }
  42. // 4. 返回转换后的 UTF-8 字符串
  43. return string(utf8Bytes)
  44. }
  45. // 根据字符集名称获取 `encoding.Encoding`
  46. func getEncoding(charset string) encoding.Encoding {
  47. switch charset {
  48. case "UTF-8", "ASCII":
  49. return encoding.Nop // 无需转换
  50. case "ISO-8859-1":
  51. return charmap.ISO8859_1
  52. case "ISO-8859-2":
  53. return charmap.ISO8859_2
  54. case "ISO-8859-15":
  55. return charmap.ISO8859_15
  56. case "Windows-1252":
  57. return charmap.Windows1252
  58. case "Big5":
  59. return traditionalchinese.Big5
  60. case "GB-2312", "GBK", "GB-18030":
  61. return simplifiedchinese.GBK
  62. case "Shift_JIS":
  63. return japanese.ShiftJIS
  64. case "EUC-JP":
  65. return japanese.EUCJP
  66. case "EUC-KR":
  67. return korean.EUCKR
  68. case "UTF-16LE":
  69. return unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM)
  70. case "UTF-16BE":
  71. return unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM)
  72. default:
  73. return nil
  74. }
  75. }
  76. func Trim(s string) string {
  77. return strings.Trim(s, " \r\n\t")
  78. }