encoding_test.go 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290
  1. // Copyright 2013 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package encoding_test
  5. import (
  6. "io/ioutil"
  7. "strings"
  8. "testing"
  9. "golang.org/x/text/encoding"
  10. "golang.org/x/text/encoding/charmap"
  11. "golang.org/x/text/transform"
  12. )
  13. func TestEncodeInvalidUTF8(t *testing.T) {
  14. inputs := []string{
  15. "hello.",
  16. "wo\ufffdld.",
  17. "ABC\xff\x80\x80", // Invalid UTF-8.
  18. "\x80\x80\x80\x80\x80",
  19. "\x80\x80D\x80\x80", // Valid rune at "D".
  20. "E\xed\xa0\x80\xed\xbf\xbfF", // Two invalid UTF-8 runes (surrogates).
  21. "G",
  22. "H\xe2\x82", // U+20AC in UTF-8 is "\xe2\x82\xac", which we split over two
  23. "\xacI\xe2\x82", // input lines. It maps to 0x80 in the Windows-1252 encoding.
  24. }
  25. // Each invalid source byte becomes '\x1a'.
  26. want := strings.Replace("hello.wo?ld.ABC??????????D??E??????FGH\x80I??", "?", "\x1a", -1)
  27. transformer := encoding.ReplaceUnsupported(charmap.Windows1252.NewEncoder())
  28. gotBuf := make([]byte, 0, 1024)
  29. src := make([]byte, 0, 1024)
  30. for i, input := range inputs {
  31. dst := make([]byte, 1024)
  32. src = append(src, input...)
  33. atEOF := i == len(inputs)-1
  34. nDst, nSrc, err := transformer.Transform(dst, src, atEOF)
  35. gotBuf = append(gotBuf, dst[:nDst]...)
  36. src = src[nSrc:]
  37. if err != nil && err != transform.ErrShortSrc {
  38. t.Fatalf("i=%d: %v", i, err)
  39. }
  40. if atEOF && err != nil {
  41. t.Fatalf("i=%d: atEOF: %v", i, err)
  42. }
  43. }
  44. if got := string(gotBuf); got != want {
  45. t.Fatalf("\ngot %+q\nwant %+q", got, want)
  46. }
  47. }
  48. func TestReplacement(t *testing.T) {
  49. for _, direction := range []string{"Decode", "Encode"} {
  50. enc, want := (transform.Transformer)(nil), ""
  51. if direction == "Decode" {
  52. enc = encoding.Replacement.NewDecoder()
  53. want = "\ufffd"
  54. } else {
  55. enc = encoding.Replacement.NewEncoder()
  56. want = "AB\x00CD\ufffdYZ"
  57. }
  58. sr := strings.NewReader("AB\x00CD\x80YZ")
  59. g, err := ioutil.ReadAll(transform.NewReader(sr, enc))
  60. if err != nil {
  61. t.Errorf("%s: ReadAll: %v", direction, err)
  62. continue
  63. }
  64. if got := string(g); got != want {
  65. t.Errorf("%s:\ngot %q\nwant %q", direction, got, want)
  66. continue
  67. }
  68. }
  69. }
  70. func TestUTF8Validator(t *testing.T) {
  71. testCases := []struct {
  72. desc string
  73. dstSize int
  74. src string
  75. atEOF bool
  76. want string
  77. wantErr error
  78. }{
  79. {
  80. "empty input",
  81. 100,
  82. "",
  83. false,
  84. "",
  85. nil,
  86. },
  87. {
  88. "valid 1-byte 1-rune input",
  89. 100,
  90. "a",
  91. false,
  92. "a",
  93. nil,
  94. },
  95. {
  96. "valid 3-byte 1-rune input",
  97. 100,
  98. "\u1234",
  99. false,
  100. "\u1234",
  101. nil,
  102. },
  103. {
  104. "valid 5-byte 3-rune input",
  105. 100,
  106. "a\u0100\u0101",
  107. false,
  108. "a\u0100\u0101",
  109. nil,
  110. },
  111. {
  112. "perfectly sized dst (non-ASCII)",
  113. 5,
  114. "a\u0100\u0101",
  115. false,
  116. "a\u0100\u0101",
  117. nil,
  118. },
  119. {
  120. "short dst (non-ASCII)",
  121. 4,
  122. "a\u0100\u0101",
  123. false,
  124. "a\u0100",
  125. transform.ErrShortDst,
  126. },
  127. {
  128. "perfectly sized dst (ASCII)",
  129. 5,
  130. "abcde",
  131. false,
  132. "abcde",
  133. nil,
  134. },
  135. {
  136. "short dst (ASCII)",
  137. 4,
  138. "abcde",
  139. false,
  140. "abcd",
  141. transform.ErrShortDst,
  142. },
  143. {
  144. "partial input (!EOF)",
  145. 100,
  146. "a\u0100\xf1",
  147. false,
  148. "a\u0100",
  149. transform.ErrShortSrc,
  150. },
  151. {
  152. "invalid input (EOF)",
  153. 100,
  154. "a\u0100\xf1",
  155. true,
  156. "a\u0100",
  157. encoding.ErrInvalidUTF8,
  158. },
  159. {
  160. "invalid input (!EOF)",
  161. 100,
  162. "a\u0100\x80",
  163. false,
  164. "a\u0100",
  165. encoding.ErrInvalidUTF8,
  166. },
  167. {
  168. "invalid input (above U+10FFFF)",
  169. 100,
  170. "a\u0100\xf7\xbf\xbf\xbf",
  171. false,
  172. "a\u0100",
  173. encoding.ErrInvalidUTF8,
  174. },
  175. {
  176. "invalid input (surrogate half)",
  177. 100,
  178. "a\u0100\xed\xa0\x80",
  179. false,
  180. "a\u0100",
  181. encoding.ErrInvalidUTF8,
  182. },
  183. }
  184. for _, tc := range testCases {
  185. dst := make([]byte, tc.dstSize)
  186. nDst, nSrc, err := encoding.UTF8Validator.Transform(dst, []byte(tc.src), tc.atEOF)
  187. if nDst < 0 || len(dst) < nDst {
  188. t.Errorf("%s: nDst=%d out of range", tc.desc, nDst)
  189. continue
  190. }
  191. got := string(dst[:nDst])
  192. if got != tc.want || nSrc != len(tc.want) || err != tc.wantErr {
  193. t.Errorf("%s:\ngot %+q, %d, %v\nwant %+q, %d, %v",
  194. tc.desc, got, nSrc, err, tc.want, len(tc.want), tc.wantErr)
  195. continue
  196. }
  197. }
  198. }
  199. func TestErrorHandler(t *testing.T) {
  200. testCases := []struct {
  201. desc string
  202. handler func(*encoding.Encoder) *encoding.Encoder
  203. sizeDst int
  204. src, want string
  205. nSrc int
  206. err error
  207. }{
  208. {
  209. desc: "one rune replacement",
  210. handler: encoding.ReplaceUnsupported,
  211. sizeDst: 100,
  212. src: "\uAC00",
  213. want: "\x1a",
  214. nSrc: 3,
  215. },
  216. {
  217. desc: "mid-stream rune replacement",
  218. handler: encoding.ReplaceUnsupported,
  219. sizeDst: 100,
  220. src: "a\uAC00bcd\u00e9",
  221. want: "a\x1abcd\xe9",
  222. nSrc: 9,
  223. },
  224. {
  225. desc: "at end rune replacement",
  226. handler: encoding.ReplaceUnsupported,
  227. sizeDst: 10,
  228. src: "\u00e9\uAC00",
  229. want: "\xe9\x1a",
  230. nSrc: 5,
  231. },
  232. {
  233. desc: "short buffer replacement",
  234. handler: encoding.ReplaceUnsupported,
  235. sizeDst: 1,
  236. src: "\u00e9\uAC00",
  237. want: "\xe9",
  238. nSrc: 2,
  239. err: transform.ErrShortDst,
  240. },
  241. {
  242. desc: "one rune html escape",
  243. handler: encoding.HTMLEscapeUnsupported,
  244. sizeDst: 100,
  245. src: "\uAC00",
  246. want: "&#44032;",
  247. nSrc: 3,
  248. },
  249. {
  250. desc: "mid-stream html escape",
  251. handler: encoding.HTMLEscapeUnsupported,
  252. sizeDst: 100,
  253. src: "\u00e9\uAC00dcba",
  254. want: "\xe9&#44032;dcba",
  255. nSrc: 9,
  256. },
  257. {
  258. desc: "short buffer html escape",
  259. handler: encoding.HTMLEscapeUnsupported,
  260. sizeDst: 9,
  261. src: "ab\uAC01",
  262. want: "ab",
  263. nSrc: 2,
  264. err: transform.ErrShortDst,
  265. },
  266. }
  267. for i, tc := range testCases {
  268. tr := tc.handler(charmap.Windows1250.NewEncoder())
  269. b := make([]byte, tc.sizeDst)
  270. nDst, nSrc, err := tr.Transform(b, []byte(tc.src), true)
  271. if err != tc.err {
  272. t.Errorf("%d:%s: error was %v; want %v", i, tc.desc, err, tc.err)
  273. }
  274. if got := string(b[:nDst]); got != tc.want {
  275. t.Errorf("%d:%s: result was %q: want %q", i, tc.desc, got, tc.want)
  276. }
  277. if nSrc != tc.nSrc {
  278. t.Errorf("%d:%s: nSrc was %d; want %d", i, tc.desc, nSrc, tc.nSrc)
  279. }
  280. }
  281. }