charset.go 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204
  1. // Package charset provides common text encodings for HTML documents.
  2. //
  3. // The mapping from encoding labels to encodings is defined at
  4. // http://encoding.spec.whatwg.org.
  5. package charset
  6. import (
  7. "bytes"
  8. "mime"
  9. "strings"
  10. "unicode/utf8"
  11. "code.google.com/p/go.net/html"
  12. "code.google.com/p/go.text/encoding"
  13. "code.google.com/p/go.text/encoding/charmap"
  14. )
  15. // Lookup returns the encoding with the specified label, and its canonical
  16. // name. It returns nil and the empty string if label is not one of the
  17. // standard encodings for HTML. Matching is case-insensitive and ignores
  18. // leading and trailing whitespace.
  19. func Lookup(label string) (e encoding.Encoding, name string) {
  20. label = strings.ToLower(strings.Trim(label, "\t\n\r\f "))
  21. enc := encodings[label]
  22. return enc.e, enc.name
  23. }
  24. // DetermineEncoding determines the encoding of an HTML document by examining
  25. // up to the first 1024 bytes of content and the declared Content-Type.
  26. //
  27. // See http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#determining-the-character-encoding
  28. func DetermineEncoding(content []byte, contentType string) (e encoding.Encoding, name string, certain bool) {
  29. if len(content) > 1024 {
  30. content = content[:1024]
  31. }
  32. for _, b := range boms {
  33. if bytes.HasPrefix(content, b.bom) {
  34. e, name = Lookup(b.enc)
  35. return e, name, true
  36. }
  37. }
  38. if _, params, err := mime.ParseMediaType(contentType); err == nil {
  39. if cs, ok := params["charset"]; ok {
  40. if e, name = Lookup(cs); e != nil {
  41. return e, name, true
  42. }
  43. }
  44. }
  45. if len(content) > 0 {
  46. e, name = prescan(content)
  47. if e != nil {
  48. return e, name, false
  49. }
  50. }
  51. // Try to detect UTF-8.
  52. // First eliminate any partial rune at the end.
  53. for i := len(content) - 1; i >= 0 && i > len(content)-4; i-- {
  54. b := content[i]
  55. if b < 0x80 {
  56. break
  57. }
  58. if utf8.RuneStart(b) {
  59. content = content[:i]
  60. break
  61. }
  62. }
  63. hasHighBit := false
  64. for _, c := range content {
  65. if c >= 0x80 {
  66. hasHighBit = true
  67. break
  68. }
  69. }
  70. if hasHighBit && utf8.Valid(content) {
  71. return encoding.Nop, "utf-8", false
  72. }
  73. // TODO: change default depending on user's locale?
  74. return charmap.Windows1252, "windows-1252", false
  75. }
  76. func prescan(content []byte) (e encoding.Encoding, name string) {
  77. z := html.NewTokenizer(bytes.NewReader(content))
  78. for {
  79. switch z.Next() {
  80. case html.ErrorToken:
  81. return nil, ""
  82. case html.StartTagToken, html.SelfClosingTagToken:
  83. tagName, hasAttr := z.TagName()
  84. if !bytes.Equal(tagName, []byte("meta")) {
  85. continue
  86. }
  87. attrList := make(map[string]bool)
  88. gotPragma := false
  89. const (
  90. dontKnow = iota
  91. doNeedPragma
  92. doNotNeedPragma
  93. )
  94. needPragma := dontKnow
  95. name = ""
  96. e = nil
  97. for hasAttr {
  98. var key, val []byte
  99. key, val, hasAttr = z.TagAttr()
  100. ks := string(key)
  101. if attrList[ks] {
  102. continue
  103. }
  104. attrList[ks] = true
  105. for i, c := range val {
  106. if 'A' <= c && c <= 'Z' {
  107. val[i] = c + 0x20
  108. }
  109. }
  110. switch ks {
  111. case "http-equiv":
  112. if bytes.Equal(val, []byte("content-type")) {
  113. gotPragma = true
  114. }
  115. case "content":
  116. if e == nil {
  117. name = fromMetaElement(string(val))
  118. if name != "" {
  119. e, name = Lookup(name)
  120. if e != nil {
  121. needPragma = doNeedPragma
  122. }
  123. }
  124. }
  125. case "charset":
  126. e, name = Lookup(string(val))
  127. needPragma = doNotNeedPragma
  128. }
  129. }
  130. if needPragma == dontKnow || needPragma == doNeedPragma && !gotPragma {
  131. continue
  132. }
  133. if strings.HasPrefix(name, "utf-16") {
  134. name = "utf-8"
  135. e = encoding.Nop
  136. }
  137. if e != nil {
  138. return e, name
  139. }
  140. }
  141. }
  142. }
  143. func fromMetaElement(s string) string {
  144. for s != "" {
  145. csLoc := strings.Index(s, "charset")
  146. if csLoc == -1 {
  147. return ""
  148. }
  149. s = s[csLoc+len("charset"):]
  150. s = strings.TrimLeft(s, " \t\n\f\r")
  151. if !strings.HasPrefix(s, "=") {
  152. continue
  153. }
  154. s = s[1:]
  155. s = strings.TrimLeft(s, " \t\n\f\r")
  156. if s == "" {
  157. return ""
  158. }
  159. if q := s[0]; q == '"' || q == '\'' {
  160. s = s[1:]
  161. closeQuote := strings.IndexRune(s, rune(q))
  162. if closeQuote == -1 {
  163. return ""
  164. }
  165. return s[:closeQuote]
  166. }
  167. end := strings.IndexAny(s, "; \t\n\f\r")
  168. if end == -1 {
  169. end = len(s)
  170. }
  171. return s[:end]
  172. }
  173. return ""
  174. }
  175. var boms = []struct {
  176. bom []byte
  177. enc string
  178. }{
  179. {[]byte{0xfe, 0xff}, "utf-16be"},
  180. {[]byte{0xff, 0xfe}, "utf-16le"},
  181. {[]byte{0xef, 0xbb, 0xbf}, "utf-8"},
  182. }