utf32.go 8.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296
  1. // Copyright 2016 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // Package utf32 provides the UTF-32 Unicode encoding.
  5. //
  6. // Please note that support for UTF-32 is discouraged as it is a rare and
  7. // inefficient encoding, unfit for use as an interchange format. For use
  8. // on the web, the W3C strongly discourages its use
  9. // (https://www.w3.org/TR/html5/document-metadata.html#charset)
  10. // while WHATWG directly prohibits supporting it
  11. // (https://html.spec.whatwg.org/multipage/syntax.html#character-encodings).
  12. package utf32 // import "golang.org/x/text/encoding/unicode/utf32"
  13. import (
  14. "errors"
  15. "unicode/utf8"
  16. "golang.org/x/text/encoding"
  17. "golang.org/x/text/encoding/internal/identifier"
  18. "golang.org/x/text/transform"
  19. )
// All lists a configuration for each IANA-defined UTF-32 variant: UTF-32
// (big-endian by default, honoring a starting BOM), UTF-32BE and UTF-32LE
// (both ignoring BOMs).
var All = []encoding.Encoding{
	UTF32(BigEndian, UseBOM),
	UTF32(BigEndian, IgnoreBOM),
	UTF32(LittleEndian, IgnoreBOM),
}
// ErrMissingBOM means that decoding UTF-32 input with ExpectBOM did not
// find a starting byte order mark. The decoder reports it when the first
// code unit of the input is not a BOM, and when the input is empty.
var ErrMissingBOM = errors.New("encoding: missing byte order mark")
  29. // UTF32 returns a UTF-32 Encoding for the given default endianness and
  30. // byte order mark (BOM) policy.
  31. //
  32. // When decoding from UTF-32 to UTF-8, if the BOMPolicy is IgnoreBOM then
  33. // neither BOMs U+FEFF nor ill-formed code units 0xFFFE0000 in the input
  34. // stream will affect the endianness used for decoding. Instead BOMs will
  35. // be output as their standard UTF-8 encoding "\xef\xbb\xbf" while
  36. // 0xFFFE0000 code units will be output as "\xef\xbf\xbd", the standard
  37. // UTF-8 encoding for the Unicode replacement character. If the BOMPolicy
  38. // is UseBOM or ExpectBOM a starting BOM is not written to the UTF-8
  39. // output. Instead, it overrides the default endianness e for the remainder
  40. // of the transformation. Any subsequent BOMs U+FEFF or ill-formed code
  41. // units 0xFFFE0000 will not affect the endianness used, and will instead
  42. // be output as their standard UTF-8 (replacement) encodings. For UseBOM,
  43. // if there is no starting BOM, it will proceed with the default
  44. // Endianness. For ExpectBOM, in that case, the transformation will return
  45. // early with an ErrMissingBOM error.
  46. //
  47. // When encoding from UTF-8 to UTF-32, a BOM will be inserted at the start
  48. // of the output if the BOMPolicy is UseBOM or ExpectBOM. Otherwise, a BOM
  49. // will not be inserted. The UTF-8 input does not need to contain a BOM.
  50. //
  51. // There is no concept of a 'native' endianness. If the UTF-32 data is
  52. // produced and consumed in a greater context that implies a certain
  53. // endianness, use IgnoreBOM. Otherwise, use ExpectBOM and always produce
  54. // and consume a BOM.
  55. //
  56. // In the language of https://www.unicode.org/faq/utf_bom.html#bom10,
  57. // IgnoreBOM corresponds to "Where the precise type of the data stream is
  58. // known... the BOM should not be used" and ExpectBOM corresponds to "A
  59. // particular protocol... may require use of the BOM".
  60. func UTF32(e Endianness, b BOMPolicy) encoding.Encoding {
  61. return utf32Encoding{config{e, b}, mibValue[e][b&bomMask]}
  62. }
// mibValue maps Endianness and BOMPolicy settings to MIB constants for UTF-32.
// Note that some configurations map to the same MIB identifier: UseBOM yields
// identifier.UTF32 for either default endianness. Policies without an
// explicit entry, such as ExpectBOM, are left at the array's zero value.
var mibValue = map[Endianness][numBOMValues]identifier.MIB{
	BigEndian: [numBOMValues]identifier.MIB{
		IgnoreBOM: identifier.UTF32BE,
		UseBOM:    identifier.UTF32,
	},
	LittleEndian: [numBOMValues]identifier.MIB{
		IgnoreBOM: identifier.UTF32LE,
		UseBOM:    identifier.UTF32,
	},
	// ExpectBOM is not widely used and has no valid MIB identifier.
}
// BOMPolicy is a UTF-32 encoding's byte order mark policy.
type BOMPolicy uint8

const (
	// Flag bits from which the exported policies below are composed.
	writeBOM   BOMPolicy = 0x01 // the encoder writes a BOM at the start of its output
	acceptBOM  BOMPolicy = 0x02 // the decoder lets a starting BOM select the byte order
	requireBOM BOMPolicy = 0x04 // the decoder returns ErrMissingBOM without a starting BOM
	bomMask    BOMPolicy = 0x07 // all of the flag bits above

	// numBOMValues is the length of the per-endianness arrays in mibValue,
	// which are indexed by a BOMPolicy masked with bomMask.
	//
	// HACK: numBOMValues == 8 triggers a bug in the 1.4 compiler (cannot have a
	// map of an array of length 8 of a type that is also used as a key or value
	// in another map). See golang.org/issue/11354.
	// TODO: consider changing this value back to 8 if the use of 1.4.* has
	// been minimized.
	numBOMValues = 8 + 1

	// IgnoreBOM means to ignore any byte order marks.
	// This is the Unicode-compliant interpretation for UTF-32BE/LE.
	IgnoreBOM BOMPolicy = 0

	// UseBOM means that the UTF-32 form may start with a byte order mark,
	// which will be used to override the default encoding.
	// This is the Unicode-compliant interpretation for UTF-32.
	UseBOM BOMPolicy = writeBOM | acceptBOM

	// ExpectBOM means that the UTF-32 form must start with a byte order mark,
	// which will be used to override the default encoding.
	ExpectBOM BOMPolicy = writeBOM | acceptBOM | requireBOM

	// The policies above are consistent with the BOMPolicy definition in
	// golang.org/x/text/encoding/unicode.
)
// Endianness is a UTF-32 encoding's default endianness: the byte order of
// each four-byte code unit unless a starting byte order mark overrides it
// while decoding with UseBOM or ExpectBOM.
type Endianness bool

const (
	// BigEndian is UTF-32BE: the most significant byte of a code unit
	// comes first.
	BigEndian Endianness = false
	// LittleEndian is UTF-32LE: the least significant byte of a code unit
	// comes first.
	LittleEndian Endianness = true
)
// config holds the two settings that make up a UTF-32 configuration.
type config struct {
	endianness Endianness // byte order of the four-byte code units
	bomPolicy  BOMPolicy  // how byte order marks are read and written
}
// utf32Encoding is the Encoding returned by UTF32. It carries the
// configuration that is handed to the decoders and encoders it creates.
type utf32Encoding struct {
	config
	mib identifier.MIB // identifier reported by ID
}
  117. func (u utf32Encoding) NewDecoder() *encoding.Decoder {
  118. return &encoding.Decoder{Transformer: &utf32Decoder{
  119. initial: u.config,
  120. current: u.config,
  121. }}
  122. }
  123. func (u utf32Encoding) NewEncoder() *encoding.Encoder {
  124. return &encoding.Encoder{Transformer: &utf32Encoder{
  125. endianness: u.endianness,
  126. initialBOMPolicy: u.bomPolicy,
  127. currentBOMPolicy: u.bomPolicy,
  128. }}
  129. }
// ID returns the MIB identifier that was selected for this configuration
// when the encoding was constructed. The alternative name is always empty.
func (u utf32Encoding) ID() (mib identifier.MIB, other string) {
	return u.mib, ""
}
  133. func (u utf32Encoding) String() string {
  134. e, b := "B", ""
  135. if u.endianness == LittleEndian {
  136. e = "L"
  137. }
  138. switch u.bomPolicy {
  139. case ExpectBOM:
  140. b = "Expect"
  141. case UseBOM:
  142. b = "Use"
  143. case IgnoreBOM:
  144. b = "Ignore"
  145. }
  146. return "UTF-32" + e + "E (" + b + " BOM)"
  147. }
// utf32Decoder is the transformer that converts UTF-32 to UTF-8.
type utf32Decoder struct {
	initial config // configuration restored by Reset
	current config // working configuration; Transform updates it once the BOM has been handled
}
// Reset restores the decoder's initial configuration, so that the default
// byte order and the BOM policy apply again to the next input.
func (u *utf32Decoder) Reset() {
	u.current = u.initial
}
  155. func (u *utf32Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
  156. if len(src) == 0 {
  157. if atEOF && u.current.bomPolicy&requireBOM != 0 {
  158. return 0, 0, ErrMissingBOM
  159. }
  160. return 0, 0, nil
  161. }
  162. if u.current.bomPolicy&acceptBOM != 0 {
  163. if len(src) < 4 {
  164. return 0, 0, transform.ErrShortSrc
  165. }
  166. switch {
  167. case src[0] == 0x00 && src[1] == 0x00 && src[2] == 0xfe && src[3] == 0xff:
  168. u.current.endianness = BigEndian
  169. nSrc = 4
  170. case src[0] == 0xff && src[1] == 0xfe && src[2] == 0x00 && src[3] == 0x00:
  171. u.current.endianness = LittleEndian
  172. nSrc = 4
  173. default:
  174. if u.current.bomPolicy&requireBOM != 0 {
  175. return 0, 0, ErrMissingBOM
  176. }
  177. }
  178. u.current.bomPolicy = IgnoreBOM
  179. }
  180. var r rune
  181. var dSize, sSize int
  182. for nSrc < len(src) {
  183. if nSrc+3 < len(src) {
  184. x := uint32(src[nSrc+0])<<24 | uint32(src[nSrc+1])<<16 |
  185. uint32(src[nSrc+2])<<8 | uint32(src[nSrc+3])
  186. if u.current.endianness == LittleEndian {
  187. x = x>>24 | (x >> 8 & 0x0000FF00) | (x << 8 & 0x00FF0000) | x<<24
  188. }
  189. r, sSize = rune(x), 4
  190. if dSize = utf8.RuneLen(r); dSize < 0 {
  191. r, dSize = utf8.RuneError, 3
  192. }
  193. } else if atEOF {
  194. // 1..3 trailing bytes.
  195. r, dSize, sSize = utf8.RuneError, 3, len(src)-nSrc
  196. } else {
  197. err = transform.ErrShortSrc
  198. break
  199. }
  200. if nDst+dSize > len(dst) {
  201. err = transform.ErrShortDst
  202. break
  203. }
  204. nDst += utf8.EncodeRune(dst[nDst:], r)
  205. nSrc += sSize
  206. }
  207. return nDst, nSrc, err
  208. }
// utf32Encoder is the transformer that converts UTF-8 to UTF-32.
type utf32Encoder struct {
	endianness       Endianness // byte order of the emitted code units
	initialBOMPolicy BOMPolicy  // policy restored by Reset
	currentBOMPolicy BOMPolicy  // working policy; set to IgnoreBOM once the BOM has been written
}
// Reset restores the initial BOM policy, so that an encoder configured to
// write a BOM writes one again at the start of its next output.
func (u *utf32Encoder) Reset() {
	u.currentBOMPolicy = u.initialBOMPolicy
}
  217. func (u *utf32Encoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
  218. if u.currentBOMPolicy&writeBOM != 0 {
  219. if len(dst) < 4 {
  220. return 0, 0, transform.ErrShortDst
  221. }
  222. dst[0], dst[1], dst[2], dst[3] = 0x00, 0x00, 0xfe, 0xff
  223. u.currentBOMPolicy = IgnoreBOM
  224. nDst = 4
  225. }
  226. r, size := rune(0), 0
  227. for nSrc < len(src) {
  228. r = rune(src[nSrc])
  229. // Decode a 1-byte rune.
  230. if r < utf8.RuneSelf {
  231. size = 1
  232. } else {
  233. // Decode a multi-byte rune.
  234. r, size = utf8.DecodeRune(src[nSrc:])
  235. if size == 1 {
  236. // All valid runes of size 1 (those below utf8.RuneSelf) were
  237. // handled above. We have invalid UTF-8 or we haven't seen the
  238. // full character yet.
  239. if !atEOF && !utf8.FullRune(src[nSrc:]) {
  240. err = transform.ErrShortSrc
  241. break
  242. }
  243. }
  244. }
  245. if nDst+4 > len(dst) {
  246. err = transform.ErrShortDst
  247. break
  248. }
  249. dst[nDst+0] = uint8(r >> 24)
  250. dst[nDst+1] = uint8(r >> 16)
  251. dst[nDst+2] = uint8(r >> 8)
  252. dst[nDst+3] = uint8(r)
  253. nDst += 4
  254. nSrc += size
  255. }
  256. if u.endianness == LittleEndian {
  257. for i := 0; i < nDst; i += 4 {
  258. dst[i], dst[i+1], dst[i+2], dst[i+3] = dst[i+3], dst[i+2], dst[i+1], dst[i]
  259. }
  260. }
  261. return nDst, nSrc, err
  262. }