hzgb2312.go 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245
  1. // Copyright 2013 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package simplifiedchinese
  5. import (
  6. "unicode/utf8"
  7. "golang.org/x/text/encoding"
  8. "golang.org/x/text/encoding/internal"
  9. "golang.org/x/text/encoding/internal/identifier"
  10. "golang.org/x/text/transform"
  11. )
  12. // HZGB2312 is the HZ-GB2312 encoding.
  13. var HZGB2312 encoding.Encoding = &hzGB2312
  14. var hzGB2312 = internal.Encoding{
  15. internal.FuncEncoding{hzGB2312NewDecoder, hzGB2312NewEncoder},
  16. "HZ-GB2312",
  17. identifier.HZGB2312,
  18. }
  19. func hzGB2312NewDecoder() transform.Transformer {
  20. return new(hzGB2312Decoder)
  21. }
  22. func hzGB2312NewEncoder() transform.Transformer {
  23. return new(hzGB2312Encoder)
  24. }
  25. const (
  26. asciiState = iota
  27. gbState
  28. )
  29. type hzGB2312Decoder int
  30. func (d *hzGB2312Decoder) Reset() {
  31. *d = asciiState
  32. }
  33. func (d *hzGB2312Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
  34. r, size := rune(0), 0
  35. loop:
  36. for ; nSrc < len(src); nSrc += size {
  37. c0 := src[nSrc]
  38. if c0 >= utf8.RuneSelf {
  39. r, size = utf8.RuneError, 1
  40. goto write
  41. }
  42. if c0 == '~' {
  43. if nSrc+1 >= len(src) {
  44. if !atEOF {
  45. err = transform.ErrShortSrc
  46. break loop
  47. }
  48. r, size = utf8.RuneError, 1
  49. goto write
  50. }
  51. size = 2
  52. switch src[nSrc+1] {
  53. case '{':
  54. *d = gbState
  55. continue
  56. case '}':
  57. *d = asciiState
  58. continue
  59. case '~':
  60. if nDst >= len(dst) {
  61. err = transform.ErrShortDst
  62. break loop
  63. }
  64. dst[nDst] = '~'
  65. nDst++
  66. continue
  67. case '\n':
  68. continue
  69. default:
  70. r = utf8.RuneError
  71. goto write
  72. }
  73. }
  74. if *d == asciiState {
  75. r, size = rune(c0), 1
  76. } else {
  77. if nSrc+1 >= len(src) {
  78. if !atEOF {
  79. err = transform.ErrShortSrc
  80. break loop
  81. }
  82. r, size = utf8.RuneError, 1
  83. goto write
  84. }
  85. size = 2
  86. c1 := src[nSrc+1]
  87. if c0 < 0x21 || 0x7e <= c0 || c1 < 0x21 || 0x7f <= c1 {
  88. // error
  89. } else if i := int(c0-0x01)*190 + int(c1+0x3f); i < len(decode) {
  90. r = rune(decode[i])
  91. if r != 0 {
  92. goto write
  93. }
  94. }
  95. if c1 > utf8.RuneSelf {
  96. // Be consistent and always treat non-ASCII as a single error.
  97. size = 1
  98. }
  99. r = utf8.RuneError
  100. }
  101. write:
  102. if nDst+utf8.RuneLen(r) > len(dst) {
  103. err = transform.ErrShortDst
  104. break loop
  105. }
  106. nDst += utf8.EncodeRune(dst[nDst:], r)
  107. }
  108. return nDst, nSrc, err
  109. }
  110. type hzGB2312Encoder int
  111. func (d *hzGB2312Encoder) Reset() {
  112. *d = asciiState
  113. }
  114. func (e *hzGB2312Encoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
  115. r, size := rune(0), 0
  116. for ; nSrc < len(src); nSrc += size {
  117. r = rune(src[nSrc])
  118. // Decode a 1-byte rune.
  119. if r < utf8.RuneSelf {
  120. size = 1
  121. if r == '~' {
  122. if nDst+2 > len(dst) {
  123. err = transform.ErrShortDst
  124. break
  125. }
  126. dst[nDst+0] = '~'
  127. dst[nDst+1] = '~'
  128. nDst += 2
  129. continue
  130. } else if *e != asciiState {
  131. if nDst+3 > len(dst) {
  132. err = transform.ErrShortDst
  133. break
  134. }
  135. *e = asciiState
  136. dst[nDst+0] = '~'
  137. dst[nDst+1] = '}'
  138. nDst += 2
  139. } else if nDst >= len(dst) {
  140. err = transform.ErrShortDst
  141. break
  142. }
  143. dst[nDst] = uint8(r)
  144. nDst += 1
  145. continue
  146. }
  147. // Decode a multi-byte rune.
  148. r, size = utf8.DecodeRune(src[nSrc:])
  149. if size == 1 {
  150. // All valid runes of size 1 (those below utf8.RuneSelf) were
  151. // handled above. We have invalid UTF-8 or we haven't seen the
  152. // full character yet.
  153. if !atEOF && !utf8.FullRune(src[nSrc:]) {
  154. err = transform.ErrShortSrc
  155. break
  156. }
  157. }
  158. // func init checks that the switch covers all tables.
  159. switch {
  160. case encode0Low <= r && r < encode0High:
  161. if r = rune(encode0[r-encode0Low]); r != 0 {
  162. goto writeGB
  163. }
  164. case encode1Low <= r && r < encode1High:
  165. if r = rune(encode1[r-encode1Low]); r != 0 {
  166. goto writeGB
  167. }
  168. case encode2Low <= r && r < encode2High:
  169. if r = rune(encode2[r-encode2Low]); r != 0 {
  170. goto writeGB
  171. }
  172. case encode3Low <= r && r < encode3High:
  173. if r = rune(encode3[r-encode3Low]); r != 0 {
  174. goto writeGB
  175. }
  176. case encode4Low <= r && r < encode4High:
  177. if r = rune(encode4[r-encode4Low]); r != 0 {
  178. goto writeGB
  179. }
  180. }
  181. terminateInASCIIState:
  182. // Switch back to ASCII state in case of error so that an ASCII
  183. // replacement character can be written in the correct state.
  184. if *e != asciiState {
  185. if nDst+2 > len(dst) {
  186. err = transform.ErrShortDst
  187. break
  188. }
  189. dst[nDst+0] = '~'
  190. dst[nDst+1] = '}'
  191. nDst += 2
  192. }
  193. err = internal.ErrASCIIReplacement
  194. break
  195. writeGB:
  196. c0 := uint8(r>>8) - 0x80
  197. c1 := uint8(r) - 0x80
  198. if c0 < 0x21 || 0x7e <= c0 || c1 < 0x21 || 0x7f <= c1 {
  199. goto terminateInASCIIState
  200. }
  201. if *e == asciiState {
  202. if nDst+4 > len(dst) {
  203. err = transform.ErrShortDst
  204. break
  205. }
  206. *e = gbState
  207. dst[nDst+0] = '~'
  208. dst[nDst+1] = '{'
  209. nDst += 2
  210. } else if nDst+2 > len(dst) {
  211. err = transform.ErrShortDst
  212. break
  213. }
  214. dst[nDst+0] = c0
  215. dst[nDst+1] = c1
  216. nDst += 2
  217. continue
  218. }
  219. // TODO: should one always terminate in ASCII state to make it safe to
  220. // concatenate two HZ-GB2312-encoded strings?
  221. return nDst, nSrc, err
  222. }