unicode.go 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434
  1. // Copyright 2013 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // Package unicode provides Unicode encodings such as UTF-16.
  5. package unicode // import "golang.org/x/text/encoding/unicode"
  6. import (
  7. "errors"
  8. "unicode/utf16"
  9. "unicode/utf8"
  10. "golang.org/x/text/encoding"
  11. "golang.org/x/text/encoding/internal"
  12. "golang.org/x/text/encoding/internal/identifier"
  13. "golang.org/x/text/internal/utf8internal"
  14. "golang.org/x/text/runes"
  15. "golang.org/x/text/transform"
  16. )
  17. // TODO: I think the Transformers really should return errors on unmatched
  18. // surrogate pairs and odd numbers of bytes. This is not required by RFC 2781,
  19. // which leaves it open, but is suggested by WhatWG. It will allow for all error
  20. // modes as defined by WhatWG: fatal, HTML and Replacement. This would require
  21. // the introduction of some kind of error type for conveying the erroneous code
  22. // point.
  23. // UTF8 is the UTF-8 encoding.
  24. var UTF8 encoding.Encoding = utf8enc
  25. var utf8enc = &internal.Encoding{
  26. &internal.SimpleEncoding{utf8Decoder{}, runes.ReplaceIllFormed()},
  27. "UTF-8",
  28. identifier.UTF8,
  29. }
  30. type utf8Decoder struct{ transform.NopResetter }
  31. func (utf8Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
  32. var pSrc int // point from which to start copy in src
  33. var accept utf8internal.AcceptRange
  34. // The decoder can only make the input larger, not smaller.
  35. n := len(src)
  36. if len(dst) < n {
  37. err = transform.ErrShortDst
  38. n = len(dst)
  39. atEOF = false
  40. }
  41. for nSrc < n {
  42. c := src[nSrc]
  43. if c < utf8.RuneSelf {
  44. nSrc++
  45. continue
  46. }
  47. first := utf8internal.First[c]
  48. size := int(first & utf8internal.SizeMask)
  49. if first == utf8internal.FirstInvalid {
  50. goto handleInvalid // invalid starter byte
  51. }
  52. accept = utf8internal.AcceptRanges[first>>utf8internal.AcceptShift]
  53. if nSrc+size > n {
  54. if !atEOF {
  55. // We may stop earlier than necessary here if the short sequence
  56. // has invalid bytes. Not checking for this simplifies the code
  57. // and may avoid duplicate computations in certain conditions.
  58. if err == nil {
  59. err = transform.ErrShortSrc
  60. }
  61. break
  62. }
  63. // Determine the maximal subpart of an ill-formed subsequence.
  64. switch {
  65. case nSrc+1 >= n || src[nSrc+1] < accept.Lo || accept.Hi < src[nSrc+1]:
  66. size = 1
  67. case nSrc+2 >= n || src[nSrc+2] < utf8internal.LoCB || utf8internal.HiCB < src[nSrc+2]:
  68. size = 2
  69. default:
  70. size = 3 // As we are short, the maximum is 3.
  71. }
  72. goto handleInvalid
  73. }
  74. if c = src[nSrc+1]; c < accept.Lo || accept.Hi < c {
  75. size = 1
  76. goto handleInvalid // invalid continuation byte
  77. } else if size == 2 {
  78. } else if c = src[nSrc+2]; c < utf8internal.LoCB || utf8internal.HiCB < c {
  79. size = 2
  80. goto handleInvalid // invalid continuation byte
  81. } else if size == 3 {
  82. } else if c = src[nSrc+3]; c < utf8internal.LoCB || utf8internal.HiCB < c {
  83. size = 3
  84. goto handleInvalid // invalid continuation byte
  85. }
  86. nSrc += size
  87. continue
  88. handleInvalid:
  89. // Copy the scanned input so far.
  90. nDst += copy(dst[nDst:], src[pSrc:nSrc])
  91. // Append RuneError to the destination.
  92. const runeError = "\ufffd"
  93. if nDst+len(runeError) > len(dst) {
  94. return nDst, nSrc, transform.ErrShortDst
  95. }
  96. nDst += copy(dst[nDst:], runeError)
  97. // Skip the maximal subpart of an ill-formed subsequence according to
  98. // the W3C standard way instead of the Go way. This Transform is
  99. // probably the only place in the text repo where it is warranted.
  100. nSrc += size
  101. pSrc = nSrc
  102. // Recompute the maximum source length.
  103. if sz := len(dst) - nDst; sz < len(src)-nSrc {
  104. err = transform.ErrShortDst
  105. n = nSrc + sz
  106. atEOF = false
  107. }
  108. }
  109. return nDst + copy(dst[nDst:], src[pSrc:nSrc]), nSrc, err
  110. }
  111. // UTF16 returns a UTF-16 Encoding for the given default endianness and byte
  112. // order mark (BOM) policy.
  113. //
  114. // When decoding from UTF-16 to UTF-8, if the BOMPolicy is IgnoreBOM then
  115. // neither BOMs U+FEFF nor noncharacters U+FFFE in the input stream will affect
  116. // the endianness used for decoding, and will instead be output as their
  117. // standard UTF-8 encodings: "\xef\xbb\xbf" and "\xef\xbf\xbe". If the BOMPolicy
  118. // is UseBOM or ExpectBOM a staring BOM is not written to the UTF-8 output.
  119. // Instead, it overrides the default endianness e for the remainder of the
  120. // transformation. Any subsequent BOMs U+FEFF or noncharacters U+FFFE will not
  121. // affect the endianness used, and will instead be output as their standard
  122. // UTF-8 encodings. For UseBOM, if there is no starting BOM, it will proceed
  123. // with the default Endianness. For ExpectBOM, in that case, the transformation
  124. // will return early with an ErrMissingBOM error.
  125. //
  126. // When encoding from UTF-8 to UTF-16, a BOM will be inserted at the start of
  127. // the output if the BOMPolicy is UseBOM or ExpectBOM. Otherwise, a BOM will not
  128. // be inserted. The UTF-8 input does not need to contain a BOM.
  129. //
  130. // There is no concept of a 'native' endianness. If the UTF-16 data is produced
  131. // and consumed in a greater context that implies a certain endianness, use
  132. // IgnoreBOM. Otherwise, use ExpectBOM and always produce and consume a BOM.
  133. //
  134. // In the language of https://www.unicode.org/faq/utf_bom.html#bom10, IgnoreBOM
  135. // corresponds to "Where the precise type of the data stream is known... the
  136. // BOM should not be used" and ExpectBOM corresponds to "A particular
  137. // protocol... may require use of the BOM".
  138. func UTF16(e Endianness, b BOMPolicy) encoding.Encoding {
  139. return utf16Encoding{config{e, b}, mibValue[e][b&bomMask]}
  140. }
  141. // mibValue maps Endianness and BOMPolicy settings to MIB constants. Note that
  142. // some configurations map to the same MIB identifier. RFC 2781 has requirements
  143. // and recommendations. Some of the "configurations" are merely recommendations,
  144. // so multiple configurations could match.
  145. var mibValue = map[Endianness][numBOMValues]identifier.MIB{
  146. BigEndian: [numBOMValues]identifier.MIB{
  147. IgnoreBOM: identifier.UTF16BE,
  148. UseBOM: identifier.UTF16, // BigEnding default is preferred by RFC 2781.
  149. // TODO: acceptBOM | strictBOM would map to UTF16BE as well.
  150. },
  151. LittleEndian: [numBOMValues]identifier.MIB{
  152. IgnoreBOM: identifier.UTF16LE,
  153. UseBOM: identifier.UTF16, // LittleEndian default is allowed and preferred on Windows.
  154. // TODO: acceptBOM | strictBOM would map to UTF16LE as well.
  155. },
  156. // ExpectBOM is not widely used and has no valid MIB identifier.
  157. }
  158. // All lists a configuration for each IANA-defined UTF-16 variant.
  159. var All = []encoding.Encoding{
  160. UTF8,
  161. UTF16(BigEndian, UseBOM),
  162. UTF16(BigEndian, IgnoreBOM),
  163. UTF16(LittleEndian, IgnoreBOM),
  164. }
  // BOMPolicy is a UTF-16 encoding's byte order mark policy.
  type BOMPolicy uint8

  // Individual policy bits. The exported policies below are combinations of
  // these flags.
  const (
  	writeBOM   BOMPolicy = 1 << iota // 0x01
  	acceptBOM                        // 0x02
  	requireBOM                       // 0x04

  	// bomMask selects all policy bits (0x07).
  	bomMask = writeBOM | acceptBOM | requireBOM
  )

  // numBOMValues is the length of the per-policy arrays indexed by BOMPolicy.
  //
  // HACK: numBOMValues == 8 triggers a bug in the 1.4 compiler (cannot have a
  // map of an array of length 8 of a type that is also used as a key or value
  // in another map). See golang.org/issue/11354.
  // TODO: consider changing this value back to 8 if the use of 1.4.* has
  // been minimized.
  const numBOMValues = 8 + 1

  const (
  	// IgnoreBOM means to ignore any byte order marks.
  	//
  	// Common and RFC 2781-compliant interpretation for UTF-16BE/LE.
  	IgnoreBOM BOMPolicy = 0

  	// UseBOM means that the UTF-16 form may start with a byte order mark,
  	// which will be used to override the default encoding.
  	//
  	// Common and RFC 2781-compliant interpretation for UTF-16.
  	UseBOM BOMPolicy = writeBOM | acceptBOM

  	// ExpectBOM means that the UTF-16 form must start with a byte order mark,
  	// which will be used to override the default encoding.
  	//
  	// Used in Java as Unicode (not to be confused with Java's UTF-16) and
  	// ICU's UTF-16,version=1. Not compliant with RFC 2781.
  	ExpectBOM BOMPolicy = UseBOM | requireBOM

  	// TODO (maybe): strictBOM: BOM must match Endianness. This would allow:
  	// - UTF-16(B|L)E,version=1: writeBOM | acceptBOM | requireBOM | strictBOM
  	//    (UnicodeBig and UnicodeLittle in Java)
  	// - RFC 2781-compliant, but less common interpretation for UTF-16(B|L)E:
  	//    acceptBOM | strictBOM (e.g. assigned to CheckBOM).
  	// This addition would be consistent with supporting ExpectBOM.
  )
  // Endianness is a UTF-16 encoding's default endianness. It is a boolean
  // whose false value is BigEndian.
  type Endianness bool

  const (
  	// BigEndian is UTF-16BE.
  	BigEndian Endianness = false

  	// LittleEndian is UTF-16LE.
  	LittleEndian Endianness = !BigEndian
  )
  // ErrMissingBOM is returned when UTF-16 input is decoded with ExpectBOM and
  // does not begin with a byte order mark.
  var ErrMissingBOM = errors.New(
  	"encoding: missing byte order mark",
  )
  208. type utf16Encoding struct {
  209. config
  210. mib identifier.MIB
  211. }
  212. type config struct {
  213. endianness Endianness
  214. bomPolicy BOMPolicy
  215. }
  216. func (u utf16Encoding) NewDecoder() *encoding.Decoder {
  217. return &encoding.Decoder{Transformer: &utf16Decoder{
  218. initial: u.config,
  219. current: u.config,
  220. }}
  221. }
  222. func (u utf16Encoding) NewEncoder() *encoding.Encoder {
  223. return &encoding.Encoder{Transformer: &utf16Encoder{
  224. endianness: u.endianness,
  225. initialBOMPolicy: u.bomPolicy,
  226. currentBOMPolicy: u.bomPolicy,
  227. }}
  228. }
  229. func (u utf16Encoding) ID() (mib identifier.MIB, other string) {
  230. return u.mib, ""
  231. }
  232. func (u utf16Encoding) String() string {
  233. e, b := "B", ""
  234. if u.endianness == LittleEndian {
  235. e = "L"
  236. }
  237. switch u.bomPolicy {
  238. case ExpectBOM:
  239. b = "Expect"
  240. case UseBOM:
  241. b = "Use"
  242. case IgnoreBOM:
  243. b = "Ignore"
  244. }
  245. return "UTF-16" + e + "E (" + b + " BOM)"
  246. }
  247. type utf16Decoder struct {
  248. initial config
  249. current config
  250. }
  251. func (u *utf16Decoder) Reset() {
  252. u.current = u.initial
  253. }
  254. func (u *utf16Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
  255. if len(src) == 0 {
  256. if atEOF && u.current.bomPolicy&requireBOM != 0 {
  257. return 0, 0, ErrMissingBOM
  258. }
  259. return 0, 0, nil
  260. }
  261. if u.current.bomPolicy&acceptBOM != 0 {
  262. if len(src) < 2 {
  263. return 0, 0, transform.ErrShortSrc
  264. }
  265. switch {
  266. case src[0] == 0xfe && src[1] == 0xff:
  267. u.current.endianness = BigEndian
  268. nSrc = 2
  269. case src[0] == 0xff && src[1] == 0xfe:
  270. u.current.endianness = LittleEndian
  271. nSrc = 2
  272. default:
  273. if u.current.bomPolicy&requireBOM != 0 {
  274. return 0, 0, ErrMissingBOM
  275. }
  276. }
  277. u.current.bomPolicy = IgnoreBOM
  278. }
  279. var r rune
  280. var dSize, sSize int
  281. for nSrc < len(src) {
  282. if nSrc+1 < len(src) {
  283. x := uint16(src[nSrc+0])<<8 | uint16(src[nSrc+1])
  284. if u.current.endianness == LittleEndian {
  285. x = x>>8 | x<<8
  286. }
  287. r, sSize = rune(x), 2
  288. if utf16.IsSurrogate(r) {
  289. if nSrc+3 < len(src) {
  290. x = uint16(src[nSrc+2])<<8 | uint16(src[nSrc+3])
  291. if u.current.endianness == LittleEndian {
  292. x = x>>8 | x<<8
  293. }
  294. // Save for next iteration if it is not a high surrogate.
  295. if isHighSurrogate(rune(x)) {
  296. r, sSize = utf16.DecodeRune(r, rune(x)), 4
  297. }
  298. } else if !atEOF {
  299. err = transform.ErrShortSrc
  300. break
  301. }
  302. }
  303. if dSize = utf8.RuneLen(r); dSize < 0 {
  304. r, dSize = utf8.RuneError, 3
  305. }
  306. } else if atEOF {
  307. // Single trailing byte.
  308. r, dSize, sSize = utf8.RuneError, 3, 1
  309. } else {
  310. err = transform.ErrShortSrc
  311. break
  312. }
  313. if nDst+dSize > len(dst) {
  314. err = transform.ErrShortDst
  315. break
  316. }
  317. nDst += utf8.EncodeRune(dst[nDst:], r)
  318. nSrc += sSize
  319. }
  320. return nDst, nSrc, err
  321. }
  // isHighSurrogate reports whether r lies in the range U+DC00..U+DFFF.
  //
  // NOTE(review): Unicode terminology calls this range the low (trailing)
  // surrogates; the name refers to the numerically higher half of the
  // surrogate block and is kept because other code calls it.
  func isHighSurrogate(r rune) bool {
  	const (
  		first = 0xDC00
  		last  = 0xDFFF
  	)
  	return r >= first && r <= last
  }
  325. type utf16Encoder struct {
  326. endianness Endianness
  327. initialBOMPolicy BOMPolicy
  328. currentBOMPolicy BOMPolicy
  329. }
  330. func (u *utf16Encoder) Reset() {
  331. u.currentBOMPolicy = u.initialBOMPolicy
  332. }
  333. func (u *utf16Encoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
  334. if u.currentBOMPolicy&writeBOM != 0 {
  335. if len(dst) < 2 {
  336. return 0, 0, transform.ErrShortDst
  337. }
  338. dst[0], dst[1] = 0xfe, 0xff
  339. u.currentBOMPolicy = IgnoreBOM
  340. nDst = 2
  341. }
  342. r, size := rune(0), 0
  343. for nSrc < len(src) {
  344. r = rune(src[nSrc])
  345. // Decode a 1-byte rune.
  346. if r < utf8.RuneSelf {
  347. size = 1
  348. } else {
  349. // Decode a multi-byte rune.
  350. r, size = utf8.DecodeRune(src[nSrc:])
  351. if size == 1 {
  352. // All valid runes of size 1 (those below utf8.RuneSelf) were
  353. // handled above. We have invalid UTF-8 or we haven't seen the
  354. // full character yet.
  355. if !atEOF && !utf8.FullRune(src[nSrc:]) {
  356. err = transform.ErrShortSrc
  357. break
  358. }
  359. }
  360. }
  361. if r <= 0xffff {
  362. if nDst+2 > len(dst) {
  363. err = transform.ErrShortDst
  364. break
  365. }
  366. dst[nDst+0] = uint8(r >> 8)
  367. dst[nDst+1] = uint8(r)
  368. nDst += 2
  369. } else {
  370. if nDst+4 > len(dst) {
  371. err = transform.ErrShortDst
  372. break
  373. }
  374. r1, r2 := utf16.EncodeRune(r)
  375. dst[nDst+0] = uint8(r1 >> 8)
  376. dst[nDst+1] = uint8(r1)
  377. dst[nDst+2] = uint8(r2 >> 8)
  378. dst[nDst+3] = uint8(r2)
  379. nDst += 4
  380. }
  381. nSrc += size
  382. }
  383. if u.endianness == LittleEndian {
  384. for i := 0; i < nDst; i += 2 {
  385. dst[i], dst[i+1] = dst[i+1], dst[i]
  386. }
  387. }
  388. return nDst, nSrc, err
  389. }