utf32_test.go 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
  4. package utf32
  5. import (
  6. "testing"
  7. "golang.org/x/text/encoding"
  8. "golang.org/x/text/encoding/internal/enctest"
  9. "golang.org/x/text/transform"
  10. )
  11. var (
  12. utf32LEIB = UTF32(LittleEndian, IgnoreBOM) // UTF-32LE (atypical interpretation)
  13. utf32LEUB = UTF32(LittleEndian, UseBOM) // UTF-32, LE
  14. // utf32LEEB = UTF32(LittleEndian, ExpectBOM) // UTF-32, LE, Expect - covered in encoding_test.go
  15. utf32BEIB = UTF32(BigEndian, IgnoreBOM) // UTF-32BE (atypical interpretation)
  16. utf32BEUB = UTF32(BigEndian, UseBOM) // UTF-32 default
  17. utf32BEEB = UTF32(BigEndian, ExpectBOM) // UTF-32 Expect
  18. )
  19. func TestBasics(t *testing.T) {
  20. testCases := []struct {
  21. e encoding.Encoding
  22. encPrefix string
  23. encSuffix string
  24. encoded string
  25. utf8 string
  26. }{{
  27. e: utf32BEIB,
  28. encoded: "\x00\x00\x00\x57\x00\x00\x00\xe4\x00\x01\xd5\x65",
  29. utf8: "\x57\u00e4\U0001d565",
  30. }, {
  31. e: UTF32(BigEndian, ExpectBOM),
  32. encPrefix: "\x00\x00\xfe\xff",
  33. encoded: "\x00\x00\x00\x57\x00\x00\x00\xe4\x00\x01\xd5\x65",
  34. utf8: "\x57\u00e4\U0001d565",
  35. }, {
  36. e: UTF32(LittleEndian, IgnoreBOM),
  37. encoded: "\x57\x00\x00\x00\xe4\x00\x00\x00\x65\xd5\x01\x00",
  38. utf8: "\x57\u00e4\U0001d565",
  39. }, {
  40. e: UTF32(LittleEndian, ExpectBOM),
  41. encPrefix: "\xff\xfe\x00\x00",
  42. encoded: "\x57\x00\x00\x00\xe4\x00\x00\x00\x65\xd5\x01\x00",
  43. utf8: "\x57\u00e4\U0001d565",
  44. }}
  45. for _, tc := range testCases {
  46. enctest.TestEncoding(t, tc.e, tc.encoded, tc.utf8, tc.encPrefix, tc.encSuffix)
  47. }
  48. }
  49. func TestFiles(t *testing.T) { enctest.TestFile(t, utf32BEIB) }
  50. func BenchmarkEncoding(b *testing.B) { enctest.Benchmark(b, utf32BEIB) }
  51. func TestUTF32(t *testing.T) {
  52. testCases := []struct {
  53. desc string
  54. src string
  55. notEOF bool // the inverse of atEOF
  56. sizeDst int
  57. want string
  58. nSrc int
  59. err error
  60. t transform.Transformer
  61. }{{
  62. desc: "utf-32 IgnoreBOM dec: empty string",
  63. t: utf32BEIB.NewDecoder(),
  64. }, {
  65. desc: "utf-32 UseBOM dec: empty string",
  66. t: utf32BEUB.NewDecoder(),
  67. }, {
  68. desc: "utf-32 ExpectBOM dec: empty string",
  69. err: ErrMissingBOM,
  70. t: utf32BEEB.NewDecoder(),
  71. }, {
  72. desc: "utf-32be dec: Doesn't interpret U+FEFF as BOM",
  73. src: "\x00\x00\xFE\xFF\x00\x01\x23\x45\x00\x00\x00\x3D\x00\x00\x00\x52\x00\x00\x00\x61",
  74. sizeDst: 100,
  75. want: "\uFEFF\U00012345=Ra",
  76. nSrc: 20,
  77. t: utf32BEIB.NewDecoder(),
  78. }, {
  79. desc: "utf-32be dec: Interprets little endian U+FEFF as invalid",
  80. src: "\xFF\xFE\x00\x00\x00\x01\x23\x45\x00\x00\x00\x3D\x00\x00\x00\x52\x00\x00\x00\x61",
  81. sizeDst: 100,
  82. want: "\uFFFD\U00012345=Ra",
  83. nSrc: 20,
  84. t: utf32BEIB.NewDecoder(),
  85. }, {
  86. desc: "utf-32le dec: Doesn't interpret U+FEFF as BOM",
  87. src: "\xFF\xFE\x00\x00\x45\x23\x01\x00\x3D\x00\x00\x00\x52\x00\x00\x00\x61\x00\x00\x00",
  88. sizeDst: 100,
  89. want: "\uFEFF\U00012345=Ra",
  90. nSrc: 20,
  91. t: utf32LEIB.NewDecoder(),
  92. }, {
  93. desc: "utf-32le dec: Interprets big endian U+FEFF as invalid",
  94. src: "\x00\x00\xFE\xFF\x45\x23\x01\x00\x3D\x00\x00\x00\x52\x00\x00\x00\x61\x00\x00\x00",
  95. sizeDst: 100,
  96. want: "\uFFFD\U00012345=Ra",
  97. nSrc: 20,
  98. t: utf32LEIB.NewDecoder(),
  99. }, {
  100. desc: "utf-32 enc: Writes big-endian BOM",
  101. src: "\U00012345=Ra",
  102. sizeDst: 100,
  103. want: "\x00\x00\xFE\xFF\x00\x01\x23\x45\x00\x00\x00\x3D\x00\x00\x00\x52\x00\x00\x00\x61",
  104. nSrc: 7,
  105. t: utf32BEUB.NewEncoder(),
  106. }, {
  107. desc: "utf-32 enc: Writes little-endian BOM",
  108. src: "\U00012345=Ra",
  109. sizeDst: 100,
  110. want: "\xFF\xFE\x00\x00\x45\x23\x01\x00\x3D\x00\x00\x00\x52\x00\x00\x00\x61\x00\x00\x00",
  111. nSrc: 7,
  112. t: utf32LEUB.NewEncoder(),
  113. }, {
  114. desc: "utf-32 dec: Interprets text using big-endian default when BOM not present",
  115. src: "\x00\x01\x23\x45\x00\x00\x00\x3D\x00\x00\x00\x52\x00\x00\x00\x61",
  116. sizeDst: 100,
  117. want: "\U00012345=Ra",
  118. nSrc: 16,
  119. t: utf32BEUB.NewDecoder(),
  120. }, {
  121. desc: "utf-32 dec: Interprets text using little-endian default when BOM not present",
  122. src: "\x45\x23\x01\x00\x3D\x00\x00\x00\x52\x00\x00\x00\x61\x00\x00\x00",
  123. sizeDst: 100,
  124. want: "\U00012345=Ra",
  125. nSrc: 16,
  126. t: utf32LEUB.NewDecoder(),
  127. }, {
  128. desc: "utf-32 dec: BOM determines encoding BE",
  129. src: "\x00\x00\xFE\xFF\x00\x01\x23\x45\x00\x00\x00\x3D\x00\x00\x00\x52\x00\x00\x00\x61",
  130. sizeDst: 100,
  131. want: "\U00012345=Ra",
  132. nSrc: 20,
  133. t: utf32BEUB.NewDecoder(),
  134. }, {
  135. desc: "utf-32 dec: BOM determines encoding LE",
  136. src: "\xFF\xFE\x00\x00\x45\x23\x01\x00\x3D\x00\x00\x00\x52\x00\x00\x00\x61\x00\x00\x00",
  137. sizeDst: 100,
  138. want: "\U00012345=Ra",
  139. nSrc: 20,
  140. t: utf32LEUB.NewDecoder(),
  141. }, {
  142. desc: "utf-32 dec: BOM determines encoding LE, change default",
  143. src: "\xFF\xFE\x00\x00\x45\x23\x01\x00\x3D\x00\x00\x00\x52\x00\x00\x00\x61\x00\x00\x00",
  144. sizeDst: 100,
  145. want: "\U00012345=Ra",
  146. nSrc: 20,
  147. t: utf32BEUB.NewDecoder(),
  148. }, {
  149. desc: "utf-32 dec: BOM determines encoding BE, change default",
  150. src: "\x00\x00\xFE\xFF\x00\x01\x23\x45\x00\x00\x00\x3D\x00\x00\x00\x52\x00\x00\x00\x61",
  151. sizeDst: 100,
  152. want: "\U00012345=Ra",
  153. nSrc: 20,
  154. t: utf32LEUB.NewDecoder(),
  155. }, {
  156. desc: "utf-32 dec: Don't change big-endian byte order mid-stream",
  157. src: "\x00\x01\x23\x45\x00\x00\x00\x3D\xFF\xFE\x00\x00\x00\x00\xFE\xFF\x00\x00\x00\x52\x00\x00\x00\x61",
  158. sizeDst: 100,
  159. want: "\U00012345=\uFFFD\uFEFFRa",
  160. nSrc: 24,
  161. t: utf32BEUB.NewDecoder(),
  162. }, {
  163. desc: "utf-32 dec: Don't change little-endian byte order mid-stream",
  164. src: "\x45\x23\x01\x00\x3D\x00\x00\x00\x00\x00\xFE\xFF\xFF\xFE\x00\x00\x52\x00\x00\x00\x61\x00\x00\x00",
  165. sizeDst: 100,
  166. want: "\U00012345=\uFFFD\uFEFFRa",
  167. nSrc: 24,
  168. t: utf32LEUB.NewDecoder(),
  169. }, {
  170. desc: "utf-32 dec: Fail on missing BOM when required",
  171. src: "\x00\x01\x23\x45\x00\x00\x00\x3D\x00\x00\x00\x52\x00\x00\x00\x61",
  172. sizeDst: 100,
  173. want: "",
  174. nSrc: 0,
  175. err: ErrMissingBOM,
  176. t: utf32BEEB.NewDecoder(),
  177. }, {
  178. desc: "utf-32 enc: Short dst",
  179. src: "\U00012345=Ra",
  180. sizeDst: 15,
  181. want: "\x00\x01\x23\x45\x00\x00\x00\x3D\x00\x00\x00\x52",
  182. nSrc: 6,
  183. err: transform.ErrShortDst,
  184. t: utf32BEIB.NewEncoder(),
  185. }, {
  186. desc: "utf-32 enc: Short src",
  187. src: "\U00012345=Ra\xC2",
  188. notEOF: true,
  189. sizeDst: 100,
  190. want: "\x00\x01\x23\x45\x00\x00\x00\x3D\x00\x00\x00\x52\x00\x00\x00\x61",
  191. nSrc: 7,
  192. err: transform.ErrShortSrc,
  193. t: utf32BEIB.NewEncoder(),
  194. }, {
  195. desc: "utf-32 enc: Invalid input",
  196. src: "\x80\xC1\xC2\x7F\xC2",
  197. sizeDst: 100,
  198. want: "\x00\x00\xFF\xFD\x00\x00\xFF\xFD\x00\x00\xFF\xFD\x00\x00\x00\x7F\x00\x00\xFF\xFD",
  199. nSrc: 5,
  200. t: utf32BEIB.NewEncoder(),
  201. }, {
  202. desc: "utf-32 dec: Short dst",
  203. src: "\x00\x00\x00\x41",
  204. sizeDst: 0,
  205. want: "",
  206. nSrc: 0,
  207. err: transform.ErrShortDst,
  208. t: utf32BEIB.NewDecoder(),
  209. }, {
  210. desc: "utf-32 dec: Short src",
  211. src: "\x00\x00\x00",
  212. notEOF: true,
  213. sizeDst: 4,
  214. want: "",
  215. nSrc: 0,
  216. err: transform.ErrShortSrc,
  217. t: utf32BEIB.NewDecoder(),
  218. }, {
  219. desc: "utf-32 dec: Invalid input",
  220. src: "\x00\x00\xD8\x00\x00\x00\xDF\xFF\x00\x11\x00\x00\x00\x00\x00",
  221. sizeDst: 100,
  222. want: "\uFFFD\uFFFD\uFFFD\uFFFD",
  223. nSrc: 15,
  224. t: utf32BEIB.NewDecoder(),
  225. }}
  226. for i, tc := range testCases {
  227. b := make([]byte, tc.sizeDst)
  228. nDst, nSrc, err := tc.t.Transform(b, []byte(tc.src), !tc.notEOF)
  229. if err != tc.err {
  230. t.Errorf("%d:%s: error was %v; want %v", i, tc.desc, err, tc.err)
  231. }
  232. if got := string(b[:nDst]); got != tc.want {
  233. t.Errorf("%d:%s: result was %q: want %q", i, tc.desc, got, tc.want)
  234. }
  235. if nSrc != tc.nSrc {
  236. t.Errorf("%d:%s: nSrc was %d; want %d", i, tc.desc, nSrc, tc.nSrc)
  237. }
  238. }
  239. }