reader.c 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465
  1. #include "yaml_private.h"
/*
 * Declarations.
 *
 * Prototypes for the reader routines defined later in this file, so that
 * they can call one another regardless of definition order.
 */

/* Record a reader error on the parser; always returns 0. */
static int
yaml_parser_set_reader_error(yaml_parser_t *parser, const char *problem,
        size_t offset, int value);

/* Refill the raw buffer by calling the parser's read handler. */
static int
yaml_parser_update_raw_buffer(yaml_parser_t *parser);

/* Pick the input encoding from the byte order mark (UTF-8 if none). */
static int
yaml_parser_determine_encoding(yaml_parser_t *parser);

/* Ensure that the buffer holds at least `length` decoded characters. */
YAML_DECLARE(int)
yaml_parser_update_buffer(yaml_parser_t *parser, size_t length);
  14. /*
  15. * Set the reader error and return 0.
  16. */
  17. static int
  18. yaml_parser_set_reader_error(yaml_parser_t *parser, const char *problem,
  19. size_t offset, int value)
  20. {
  21. parser->error = YAML_READER_ERROR;
  22. parser->problem = problem;
  23. parser->problem_offset = offset;
  24. parser->problem_value = value;
  25. return 0;
  26. }
  27. /*
  28. * Byte order marks.
  29. */
  30. #define BOM_UTF8 "\xef\xbb\xbf"
  31. #define BOM_UTF16LE "\xff\xfe"
  32. #define BOM_UTF16BE "\xfe\xff"
  33. /*
  34. * Determine the input stream encoding by checking the BOM symbol. If no BOM is
  35. * found, the UTF-8 encoding is assumed. Return 1 on success, 0 on failure.
  36. */
  37. static int
  38. yaml_parser_determine_encoding(yaml_parser_t *parser)
  39. {
  40. /* Ensure that we had enough bytes in the raw buffer. */
  41. while (!parser->eof
  42. && parser->raw_buffer.last - parser->raw_buffer.pointer < 3) {
  43. if (!yaml_parser_update_raw_buffer(parser)) {
  44. return 0;
  45. }
  46. }
  47. /* Determine the encoding. */
  48. if (parser->raw_buffer.last - parser->raw_buffer.pointer >= 2
  49. && !memcmp(parser->raw_buffer.pointer, BOM_UTF16LE, 2)) {
  50. parser->encoding = YAML_UTF16LE_ENCODING;
  51. parser->raw_buffer.pointer += 2;
  52. parser->offset += 2;
  53. }
  54. else if (parser->raw_buffer.last - parser->raw_buffer.pointer >= 2
  55. && !memcmp(parser->raw_buffer.pointer, BOM_UTF16BE, 2)) {
  56. parser->encoding = YAML_UTF16BE_ENCODING;
  57. parser->raw_buffer.pointer += 2;
  58. parser->offset += 2;
  59. }
  60. else if (parser->raw_buffer.last - parser->raw_buffer.pointer >= 3
  61. && !memcmp(parser->raw_buffer.pointer, BOM_UTF8, 3)) {
  62. parser->encoding = YAML_UTF8_ENCODING;
  63. parser->raw_buffer.pointer += 3;
  64. parser->offset += 3;
  65. }
  66. else {
  67. parser->encoding = YAML_UTF8_ENCODING;
  68. }
  69. return 1;
  70. }
  71. /*
  72. * Update the raw buffer.
  73. */
  74. static int
  75. yaml_parser_update_raw_buffer(yaml_parser_t *parser)
  76. {
  77. size_t size_read = 0;
  78. /* Return if the raw buffer is full. */
  79. if (parser->raw_buffer.start == parser->raw_buffer.pointer
  80. && parser->raw_buffer.last == parser->raw_buffer.end)
  81. return 1;
  82. /* Return on EOF. */
  83. if (parser->eof) return 1;
  84. /* Move the remaining bytes in the raw buffer to the beginning. */
  85. if (parser->raw_buffer.start < parser->raw_buffer.pointer
  86. && parser->raw_buffer.pointer < parser->raw_buffer.last) {
  87. memmove(parser->raw_buffer.start, parser->raw_buffer.pointer,
  88. parser->raw_buffer.last - parser->raw_buffer.pointer);
  89. }
  90. parser->raw_buffer.last -=
  91. parser->raw_buffer.pointer - parser->raw_buffer.start;
  92. parser->raw_buffer.pointer = parser->raw_buffer.start;
  93. /* Call the read handler to fill the buffer. */
  94. if (!parser->read_handler(parser->read_handler_data, parser->raw_buffer.last,
  95. parser->raw_buffer.end - parser->raw_buffer.last, &size_read)) {
  96. return yaml_parser_set_reader_error(parser, "input error",
  97. parser->offset, -1);
  98. }
  99. parser->raw_buffer.last += size_read;
  100. if (!size_read) {
  101. parser->eof = 1;
  102. }
  103. return 1;
  104. }
  105. /*
  106. * Ensure that the buffer contains at least `length` characters.
  107. * Return 1 on success, 0 on failure.
  108. *
  109. * The length is supposed to be significantly less that the buffer size.
  110. */
  111. YAML_DECLARE(int)
  112. yaml_parser_update_buffer(yaml_parser_t *parser, size_t length)
  113. {
  114. int first = 1;
  115. assert(parser->read_handler); /* Read handler must be set. */
  116. /* If the EOF flag is set and the raw buffer is empty, do nothing. */
  117. if (parser->eof && parser->raw_buffer.pointer == parser->raw_buffer.last)
  118. return 1;
  119. /* Return if the buffer contains enough characters. */
  120. if (parser->unread >= length)
  121. return 1;
  122. /* Determine the input encoding if it is not known yet. */
  123. if (!parser->encoding) {
  124. if (!yaml_parser_determine_encoding(parser))
  125. return 0;
  126. }
  127. /* Move the unread characters to the beginning of the buffer. */
  128. if (parser->buffer.start < parser->buffer.pointer
  129. && parser->buffer.pointer < parser->buffer.last) {
  130. size_t size = parser->buffer.last - parser->buffer.pointer;
  131. memmove(parser->buffer.start, parser->buffer.pointer, size);
  132. parser->buffer.pointer = parser->buffer.start;
  133. parser->buffer.last = parser->buffer.start + size;
  134. }
  135. else if (parser->buffer.pointer == parser->buffer.last) {
  136. parser->buffer.pointer = parser->buffer.start;
  137. parser->buffer.last = parser->buffer.start;
  138. }
  139. /* Fill the buffer until it has enough characters. */
  140. while (parser->unread < length)
  141. {
  142. /* Fill the raw buffer if necessary. */
  143. if (!first || parser->raw_buffer.pointer == parser->raw_buffer.last) {
  144. if (!yaml_parser_update_raw_buffer(parser)) return 0;
  145. }
  146. first = 0;
  147. /* Decode the raw buffer. */
  148. while (parser->raw_buffer.pointer != parser->raw_buffer.last)
  149. {
  150. unsigned int value = 0, value2 = 0;
  151. int incomplete = 0;
  152. unsigned char octet;
  153. unsigned int width = 0;
  154. int low, high;
  155. size_t k;
  156. size_t raw_unread = parser->raw_buffer.last - parser->raw_buffer.pointer;
  157. /* Decode the next character. */
  158. switch (parser->encoding)
  159. {
  160. case YAML_UTF8_ENCODING:
  161. /*
  162. * Decode a UTF-8 character. Check RFC 3629
  163. * (http://www.ietf.org/rfc/rfc3629.txt) for more details.
  164. *
  165. * The following table (taken from the RFC) is used for
  166. * decoding.
  167. *
  168. * Char. number range | UTF-8 octet sequence
  169. * (hexadecimal) | (binary)
  170. * --------------------+------------------------------------
  171. * 0000 0000-0000 007F | 0xxxxxxx
  172. * 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
  173. * 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
  174. * 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  175. *
  176. * Additionally, the characters in the range 0xD800-0xDFFF
  177. * are prohibited as they are reserved for use with UTF-16
  178. * surrogate pairs.
  179. */
  180. /* Determine the length of the UTF-8 sequence. */
  181. octet = parser->raw_buffer.pointer[0];
  182. width = (octet & 0x80) == 0x00 ? 1 :
  183. (octet & 0xE0) == 0xC0 ? 2 :
  184. (octet & 0xF0) == 0xE0 ? 3 :
  185. (octet & 0xF8) == 0xF0 ? 4 : 0;
  186. /* Check if the leading octet is valid. */
  187. if (!width)
  188. return yaml_parser_set_reader_error(parser,
  189. "invalid leading UTF-8 octet",
  190. parser->offset, octet);
  191. /* Check if the raw buffer contains an incomplete character. */
  192. if (width > raw_unread) {
  193. if (parser->eof) {
  194. return yaml_parser_set_reader_error(parser,
  195. "incomplete UTF-8 octet sequence",
  196. parser->offset, -1);
  197. }
  198. incomplete = 1;
  199. break;
  200. }
  201. /* Decode the leading octet. */
  202. value = (octet & 0x80) == 0x00 ? octet & 0x7F :
  203. (octet & 0xE0) == 0xC0 ? octet & 0x1F :
  204. (octet & 0xF0) == 0xE0 ? octet & 0x0F :
  205. (octet & 0xF8) == 0xF0 ? octet & 0x07 : 0;
  206. /* Check and decode the trailing octets. */
  207. for (k = 1; k < width; k ++)
  208. {
  209. octet = parser->raw_buffer.pointer[k];
  210. /* Check if the octet is valid. */
  211. if ((octet & 0xC0) != 0x80)
  212. return yaml_parser_set_reader_error(parser,
  213. "invalid trailing UTF-8 octet",
  214. parser->offset+k, octet);
  215. /* Decode the octet. */
  216. value = (value << 6) + (octet & 0x3F);
  217. }
  218. /* Check the length of the sequence against the value. */
  219. if (!((width == 1) ||
  220. (width == 2 && value >= 0x80) ||
  221. (width == 3 && value >= 0x800) ||
  222. (width == 4 && value >= 0x10000)))
  223. return yaml_parser_set_reader_error(parser,
  224. "invalid length of a UTF-8 sequence",
  225. parser->offset, -1);
  226. /* Check the range of the value. */
  227. if ((value >= 0xD800 && value <= 0xDFFF) || value > 0x10FFFF)
  228. return yaml_parser_set_reader_error(parser,
  229. "invalid Unicode character",
  230. parser->offset, value);
  231. break;
  232. case YAML_UTF16LE_ENCODING:
  233. case YAML_UTF16BE_ENCODING:
  234. low = (parser->encoding == YAML_UTF16LE_ENCODING ? 0 : 1);
  235. high = (parser->encoding == YAML_UTF16LE_ENCODING ? 1 : 0);
  236. /*
  237. * The UTF-16 encoding is not as simple as one might
  238. * naively think. Check RFC 2781
  239. * (http://www.ietf.org/rfc/rfc2781.txt).
  240. *
  241. * Normally, two subsequent bytes describe a Unicode
  242. * character. However a special technique (called a
  243. * surrogate pair) is used for specifying character
  244. * values larger than 0xFFFF.
  245. *
  246. * A surrogate pair consists of two pseudo-characters:
  247. * high surrogate area (0xD800-0xDBFF)
  248. * low surrogate area (0xDC00-0xDFFF)
  249. *
  250. * The following formulas are used for decoding
  251. * and encoding characters using surrogate pairs:
  252. *
  253. * U = U' + 0x10000 (0x01 00 00 <= U <= 0x10 FF FF)
  254. * U' = yyyyyyyyyyxxxxxxxxxx (0 <= U' <= 0x0F FF FF)
  255. * W1 = 110110yyyyyyyyyy
  256. * W2 = 110111xxxxxxxxxx
  257. *
  258. * where U is the character value, W1 is the high surrogate
  259. * area, W2 is the low surrogate area.
  260. */
  261. /* Check for incomplete UTF-16 character. */
  262. if (raw_unread < 2) {
  263. if (parser->eof) {
  264. return yaml_parser_set_reader_error(parser,
  265. "incomplete UTF-16 character",
  266. parser->offset, -1);
  267. }
  268. incomplete = 1;
  269. break;
  270. }
  271. /* Get the character. */
  272. value = parser->raw_buffer.pointer[low]
  273. + (parser->raw_buffer.pointer[high] << 8);
  274. /* Check for unexpected low surrogate area. */
  275. if ((value & 0xFC00) == 0xDC00)
  276. return yaml_parser_set_reader_error(parser,
  277. "unexpected low surrogate area",
  278. parser->offset, value);
  279. /* Check for a high surrogate area. */
  280. if ((value & 0xFC00) == 0xD800) {
  281. width = 4;
  282. /* Check for incomplete surrogate pair. */
  283. if (raw_unread < 4) {
  284. if (parser->eof) {
  285. return yaml_parser_set_reader_error(parser,
  286. "incomplete UTF-16 surrogate pair",
  287. parser->offset, -1);
  288. }
  289. incomplete = 1;
  290. break;
  291. }
  292. /* Get the next character. */
  293. value2 = parser->raw_buffer.pointer[low+2]
  294. + (parser->raw_buffer.pointer[high+2] << 8);
  295. /* Check for a low surrogate area. */
  296. if ((value2 & 0xFC00) != 0xDC00)
  297. return yaml_parser_set_reader_error(parser,
  298. "expected low surrogate area",
  299. parser->offset+2, value2);
  300. /* Generate the value of the surrogate pair. */
  301. value = 0x10000 + ((value & 0x3FF) << 10) + (value2 & 0x3FF);
  302. }
  303. else {
  304. width = 2;
  305. }
  306. break;
  307. default:
  308. assert(1); /* Impossible. */
  309. }
  310. /* Check if the raw buffer contains enough bytes to form a character. */
  311. if (incomplete) break;
  312. /*
  313. * Check if the character is in the allowed range:
  314. * #x9 | #xA | #xD | [#x20-#x7E] (8 bit)
  315. * | #x85 | [#xA0-#xD7FF] | [#xE000-#xFFFD] (16 bit)
  316. * | [#x10000-#x10FFFF] (32 bit)
  317. */
  318. if (! (value == 0x09 || value == 0x0A || value == 0x0D
  319. || (value >= 0x20 && value <= 0x7E)
  320. || (value == 0x85) || (value >= 0xA0 && value <= 0xD7FF)
  321. || (value >= 0xE000 && value <= 0xFFFD)
  322. || (value >= 0x10000 && value <= 0x10FFFF)))
  323. return yaml_parser_set_reader_error(parser,
  324. "control characters are not allowed",
  325. parser->offset, value);
  326. /* Move the raw pointers. */
  327. parser->raw_buffer.pointer += width;
  328. parser->offset += width;
  329. /* Finally put the character into the buffer. */
  330. /* 0000 0000-0000 007F -> 0xxxxxxx */
  331. if (value <= 0x7F) {
  332. *(parser->buffer.last++) = value;
  333. }
  334. /* 0000 0080-0000 07FF -> 110xxxxx 10xxxxxx */
  335. else if (value <= 0x7FF) {
  336. *(parser->buffer.last++) = 0xC0 + (value >> 6);
  337. *(parser->buffer.last++) = 0x80 + (value & 0x3F);
  338. }
  339. /* 0000 0800-0000 FFFF -> 1110xxxx 10xxxxxx 10xxxxxx */
  340. else if (value <= 0xFFFF) {
  341. *(parser->buffer.last++) = 0xE0 + (value >> 12);
  342. *(parser->buffer.last++) = 0x80 + ((value >> 6) & 0x3F);
  343. *(parser->buffer.last++) = 0x80 + (value & 0x3F);
  344. }
  345. /* 0001 0000-0010 FFFF -> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
  346. else {
  347. *(parser->buffer.last++) = 0xF0 + (value >> 18);
  348. *(parser->buffer.last++) = 0x80 + ((value >> 12) & 0x3F);
  349. *(parser->buffer.last++) = 0x80 + ((value >> 6) & 0x3F);
  350. *(parser->buffer.last++) = 0x80 + (value & 0x3F);
  351. }
  352. parser->unread ++;
  353. }
  354. /* On EOF, put NUL into the buffer and return. */
  355. if (parser->eof) {
  356. *(parser->buffer.last++) = '\0';
  357. parser->unread ++;
  358. return 1;
  359. }
  360. }
  361. return 1;
  362. }