reader_c.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397
  1. package goyaml
  2. import (
  3. "io"
  4. )
  5. // Set the reader error and return 0.
  6. func yaml_parser_set_reader_error(parser *yaml_parser_t, problem string, offset int, value int) bool {
  7. parser.error = yaml_READER_ERROR
  8. parser.problem = problem
  9. parser.problem_offset = offset
  10. parser.problem_value = value
  11. return false
  12. }
  13. // Byte order marks.
  14. const (
  15. bom_UTF8 = "\xef\xbb\xbf"
  16. bom_UTF16LE = "\xff\xfe"
  17. bom_UTF16BE = "\xfe\xff"
  18. )
  19. // Determine the input stream encoding by checking the BOM symbol. If no BOM is
  20. // found, the UTF-8 encoding is assumed. Return 1 on success, 0 on failure.
  21. func yaml_parser_determine_encoding(parser *yaml_parser_t) bool {
  22. // Ensure that we had enough bytes in the raw buffer.
  23. for !parser.eof && len(parser.raw_buffer)-parser.raw_buffer_pos < 3 {
  24. if !yaml_parser_update_raw_buffer(parser) {
  25. return false
  26. }
  27. }
  28. // Determine the encoding.
  29. buf := parser.raw_buffer
  30. pos := parser.raw_buffer_pos
  31. avail := len(buf) - pos
  32. if avail >= 2 && buf[pos] == bom_UTF16LE[0] && buf[pos+1] == bom_UTF16LE[1] {
  33. parser.encoding = yaml_UTF16LE_ENCODING
  34. parser.raw_buffer_pos += 2
  35. parser.offset += 2
  36. } else if avail >= 2 && buf[pos] == bom_UTF16BE[0] && buf[pos+1] == bom_UTF16BE[1] {
  37. parser.encoding = yaml_UTF16BE_ENCODING
  38. parser.raw_buffer_pos += 2
  39. parser.offset += 2
  40. } else if avail >= 3 && buf[pos] == bom_UTF8[0] && buf[pos+1] == bom_UTF8[1] && buf[pos+2] == bom_UTF8[2] {
  41. parser.encoding = yaml_UTF8_ENCODING
  42. parser.raw_buffer_pos += 3
  43. parser.offset += 3
  44. } else {
  45. parser.encoding = yaml_UTF8_ENCODING
  46. }
  47. return true
  48. }
  49. // Update the raw buffer.
  50. func yaml_parser_update_raw_buffer(parser *yaml_parser_t) bool {
  51. size_read := 0
  52. // Return if the raw buffer is full.
  53. if parser.raw_buffer_pos == 0 && len(parser.raw_buffer) == cap(parser.raw_buffer) {
  54. return true
  55. }
  56. // Return on EOF.
  57. if parser.eof {
  58. return true
  59. }
  60. // Move the remaining bytes in the raw buffer to the beginning.
  61. if parser.raw_buffer_pos > 0 && parser.raw_buffer_pos < len(parser.raw_buffer) {
  62. copy(parser.raw_buffer, parser.raw_buffer[parser.raw_buffer_pos:])
  63. }
  64. parser.raw_buffer = parser.raw_buffer[:len(parser.raw_buffer)-parser.raw_buffer_pos]
  65. parser.raw_buffer_pos = 0
  66. // Call the read handler to fill the buffer.
  67. size_read, err := parser.read_handler(parser, parser.raw_buffer[len(parser.raw_buffer):cap(parser.raw_buffer)])
  68. parser.raw_buffer = parser.raw_buffer[:len(parser.raw_buffer)+size_read]
  69. if err == io.EOF {
  70. parser.eof = true
  71. } else if err != nil {
  72. return yaml_parser_set_reader_error(parser, "input error: "+err.Error(), parser.offset, -1)
  73. }
  74. return true
  75. }
  76. // Ensure that the buffer contains at least `length` characters.
  77. // Return true on success, false on failure.
  78. //
  79. // The length is supposed to be significantly less that the buffer size.
  80. func yaml_parser_update_buffer(parser *yaml_parser_t, length int) bool {
  81. if parser.read_handler == nil {
  82. panic("read handler must be set")
  83. }
  84. // If the EOF flag is set and the raw buffer is empty, do nothing.
  85. if parser.eof && parser.raw_buffer_pos == len(parser.raw_buffer) {
  86. return true
  87. }
  88. // Return if the buffer contains enough characters.
  89. if parser.unread >= length {
  90. return true
  91. }
  92. // Determine the input encoding if it is not known yet.
  93. if parser.encoding == yaml_ANY_ENCODING {
  94. if !yaml_parser_determine_encoding(parser) {
  95. return false
  96. }
  97. }
  98. // Move the unread characters to the beginning of the buffer.
  99. if parser.buffer_pos > 0 && parser.buffer_pos < len(parser.buffer) {
  100. size := len(parser.buffer) - parser.buffer_pos
  101. copy(parser.buffer, parser.buffer[parser.buffer_pos:])
  102. parser.buffer_pos = 0
  103. parser.buffer = parser.buffer[:size]
  104. } else if parser.buffer_pos == len(parser.buffer) {
  105. parser.buffer_pos = 0
  106. parser.buffer = parser.buffer[:0]
  107. }
  108. // Fill the buffer until it has enough characters.
  109. first := true
  110. for parser.unread < length {
  111. // Fill the raw buffer if necessary.
  112. if !first || parser.raw_buffer_pos == len(parser.raw_buffer) {
  113. if !yaml_parser_update_raw_buffer(parser) {
  114. return false
  115. }
  116. }
  117. first = false
  118. // Decode the raw buffer.
  119. for parser.raw_buffer_pos != len(parser.raw_buffer) {
  120. var value, value2 rune
  121. var incomplete bool
  122. var width int
  123. raw_unread := len(parser.raw_buffer) - parser.raw_buffer_pos
  124. // Decode the next character.
  125. switch parser.encoding {
  126. case yaml_UTF8_ENCODING:
  127. // Decode a UTF-8 character. Check RFC 3629
  128. // (http://www.ietf.org/rfc/rfc3629.txt) for more details.
  129. //
  130. // The following table (taken from the RFC) is used for
  131. // decoding.
  132. //
  133. // Char. number range | UTF-8 octet sequence
  134. // (hexadecimal) | (binary)
  135. // --------------------+------------------------------------
  136. // 0000 0000-0000 007F | 0xxxxxxx
  137. // 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
  138. // 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
  139. // 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  140. //
  141. // Additionally, the characters in the range 0xD800-0xDFFF
  142. // are prohibited as they are reserved for use with UTF-16
  143. // surrogate pairs.
  144. // Determine the length of the UTF-8 sequence.
  145. octet := parser.raw_buffer[parser.raw_buffer_pos]
  146. switch {
  147. case octet&0x80 == 0x00:
  148. width = 1
  149. case octet&0xE0 == 0xC0:
  150. width = 2
  151. case octet&0xF0 == 0xE0:
  152. width = 3
  153. case octet&0xF8 == 0xF0:
  154. width = 4
  155. default:
  156. // The leading octet is invalid.
  157. return yaml_parser_set_reader_error(parser,
  158. "invalid leading UTF-8 octet",
  159. parser.offset, int(octet))
  160. }
  161. // Check if the raw buffer contains an incomplete character.
  162. if width > raw_unread {
  163. if parser.eof {
  164. return yaml_parser_set_reader_error(parser,
  165. "incomplete UTF-8 octet sequence",
  166. parser.offset, -1)
  167. }
  168. incomplete = true
  169. break
  170. }
  171. // Decode the leading octet.
  172. switch {
  173. case octet&0x80 == 0x00:
  174. value = rune(octet & 0x7F)
  175. case octet&0xE0 == 0xC0:
  176. value = rune(octet & 0x1F)
  177. case octet&0xF0 == 0xE0:
  178. value = rune(octet & 0x0F)
  179. case octet&0xF8 == 0xF0:
  180. value = rune(octet & 0x07)
  181. default:
  182. value = 0
  183. }
  184. // Check and decode the trailing octets.
  185. for k := 1; k < width; k++ {
  186. octet = parser.raw_buffer[parser.raw_buffer_pos+k]
  187. // Check if the octet is valid.
  188. if (octet & 0xC0) != 0x80 {
  189. return yaml_parser_set_reader_error(parser,
  190. "invalid trailing UTF-8 octet",
  191. parser.offset+k, int(octet))
  192. }
  193. // Decode the octet.
  194. value = (value << 6) + rune(octet&0x3F)
  195. }
  196. // Check the length of the sequence against the value.
  197. switch {
  198. case width == 1:
  199. case width == 2 && value >= 0x80:
  200. case width == 3 && value >= 0x800:
  201. case width == 4 && value >= 0x10000:
  202. default:
  203. return yaml_parser_set_reader_error(parser,
  204. "invalid length of a UTF-8 sequence",
  205. parser.offset, -1)
  206. }
  207. // Check the range of the value.
  208. if value >= 0xD800 && value <= 0xDFFF || value > 0x10FFFF {
  209. return yaml_parser_set_reader_error(parser,
  210. "invalid Unicode character",
  211. parser.offset, int(value))
  212. }
  213. case yaml_UTF16LE_ENCODING, yaml_UTF16BE_ENCODING:
  214. var low, high int
  215. if parser.encoding == yaml_UTF16LE_ENCODING {
  216. low, high = 0, 1
  217. } else {
  218. high, low = 1, 0
  219. }
  220. // The UTF-16 encoding is not as simple as one might
  221. // naively think. Check RFC 2781
  222. // (http://www.ietf.org/rfc/rfc2781.txt).
  223. //
  224. // Normally, two subsequent bytes describe a Unicode
  225. // character. However a special technique (called a
  226. // surrogate pair) is used for specifying character
  227. // values larger than 0xFFFF.
  228. //
  229. // A surrogate pair consists of two pseudo-characters:
  230. // high surrogate area (0xD800-0xDBFF)
  231. // low surrogate area (0xDC00-0xDFFF)
  232. //
  233. // The following formulas are used for decoding
  234. // and encoding characters using surrogate pairs:
  235. //
  236. // U = U' + 0x10000 (0x01 00 00 <= U <= 0x10 FF FF)
  237. // U' = yyyyyyyyyyxxxxxxxxxx (0 <= U' <= 0x0F FF FF)
  238. // W1 = 110110yyyyyyyyyy
  239. // W2 = 110111xxxxxxxxxx
  240. //
  241. // where U is the character value, W1 is the high surrogate
  242. // area, W2 is the low surrogate area.
  243. // Check for incomplete UTF-16 character.
  244. if raw_unread < 2 {
  245. if parser.eof {
  246. return yaml_parser_set_reader_error(parser,
  247. "incomplete UTF-16 character",
  248. parser.offset, -1)
  249. }
  250. incomplete = true
  251. break
  252. }
  253. // Get the character.
  254. value = rune(parser.raw_buffer[parser.raw_buffer_pos+low]) +
  255. (rune(parser.raw_buffer[parser.raw_buffer_pos+high]) << 8)
  256. // Check for unexpected low surrogate area.
  257. if value&0xFC00 == 0xDC00 {
  258. return yaml_parser_set_reader_error(parser,
  259. "unexpected low surrogate area",
  260. parser.offset, int(value))
  261. }
  262. // Check for a high surrogate area.
  263. if value&0xFC00 == 0xD800 {
  264. width = 4
  265. // Check for incomplete surrogate pair.
  266. if raw_unread < 4 {
  267. if parser.eof {
  268. return yaml_parser_set_reader_error(parser,
  269. "incomplete UTF-16 surrogate pair",
  270. parser.offset, -1)
  271. }
  272. incomplete = true
  273. break
  274. }
  275. // Get the next character.
  276. value2 = rune(parser.raw_buffer[parser.raw_buffer_pos+low+2]) +
  277. (rune(parser.raw_buffer[parser.raw_buffer_pos+high+2]) << 8)
  278. // Check for a low surrogate area.
  279. if value2&0xFC00 != 0xDC00 {
  280. return yaml_parser_set_reader_error(parser,
  281. "expected low surrogate area",
  282. parser.offset+2, int(value2))
  283. }
  284. // Generate the value of the surrogate pair.
  285. value = 0x10000 + ((value & 0x3FF) << 10) + (value2 & 0x3FF)
  286. } else {
  287. width = 2
  288. }
  289. default:
  290. panic("impossible")
  291. }
  292. // Check if the raw buffer contains enough bytes to form a character.
  293. if incomplete {
  294. break
  295. }
  296. // Check if the character is in the allowed range:
  297. // #x9 | #xA | #xD | [#x20-#x7E] (8 bit)
  298. // | #x85 | [#xA0-#xD7FF] | [#xE000-#xFFFD] (16 bit)
  299. // | [#x10000-#x10FFFF] (32 bit)
  300. switch {
  301. case value == 0x09:
  302. case value == 0x0A:
  303. case value == 0x0D:
  304. case value >= 0x20 && value <= 0x7E:
  305. case value == 0x85:
  306. case value >= 0xA0 && value <= 0xD7FF:
  307. case value >= 0xE000 && value <= 0xFFFD:
  308. case value >= 0x10000 && value <= 0x10FFFF:
  309. default:
  310. return yaml_parser_set_reader_error(parser,
  311. "control characters are not allowed",
  312. parser.offset, int(value))
  313. }
  314. // Move the raw pointers.
  315. parser.raw_buffer_pos += width
  316. parser.offset += width
  317. pos := len(parser.buffer)
  318. parser.buffer = parser.buffer[:pos+width]
  319. // Finally put the character into the buffer.
  320. if value <= 0x7F {
  321. // 0000 0000-0000 007F . 0xxxxxxx
  322. parser.buffer[pos+0] = byte(value)
  323. } else if value <= 0x7FF {
  324. // 0000 0080-0000 07FF . 110xxxxx 10xxxxxx
  325. parser.buffer[pos+0] = byte(0xC0 + (value >> 6))
  326. parser.buffer[pos+1] = byte(0x80 + (value & 0x3F))
  327. } else if value <= 0xFFFF {
  328. // 0000 0800-0000 FFFF . 1110xxxx 10xxxxxx 10xxxxxx
  329. parser.buffer[pos+0] = byte(0xE0 + (value >> 12))
  330. parser.buffer[pos+1] = byte(0x80 + ((value >> 6) & 0x3F))
  331. parser.buffer[pos+2] = byte(0x80 + (value & 0x3F))
  332. } else {
  333. // 0001 0000-0010 FFFF . 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  334. parser.buffer[pos+0] = byte(0xF0 + (value >> 18))
  335. parser.buffer[pos+1] = byte(0x80 + ((value >> 12) & 0x3F))
  336. parser.buffer[pos+2] = byte(0x80 + ((value >> 6) & 0x3F))
  337. parser.buffer[pos+3] = byte(0x80 + (value & 0x3F))
  338. }
  339. parser.unread++
  340. }
  341. // On EOF, put NUL into the buffer and return.
  342. if parser.eof {
  343. parser.buffer = parser.buffer[:len(parser.buffer)+1]
  344. parser.buffer[len(parser.buffer)-1] = 0x00
  345. parser.unread++
  346. return true
  347. }
  348. }
  349. return true
  350. }