Ticket #138: patch
| File patch, 36.3 KB (added by spitzak@…, 4 years ago) |
|---|
-
api.c
diff -ur original_src/api.c src/api.c
old new 611 611 } 612 612 613 613 /* 614 * Check if a string is a valid UTF-8 sequence.615 *616 * Check 'reader.c' for more details on UTF-8 encoding.617 */618 619 static int620 yaml_check_utf8(yaml_char_t *start, size_t length)621 {622 yaml_char_t *end = start+length;623 yaml_char_t *pointer = start;624 625 while (pointer < end) {626 unsigned char octet;627 unsigned int width;628 unsigned int value;629 size_t k;630 631 octet = pointer[0];632 width = (octet & 0x80) == 0x00 ? 1 :633 (octet & 0xE0) == 0xC0 ? 2 :634 (octet & 0xF0) == 0xE0 ? 3 :635 (octet & 0xF8) == 0xF0 ? 4 : 0;636 value = (octet & 0x80) == 0x00 ? octet & 0x7F :637 (octet & 0xE0) == 0xC0 ? octet & 0x1F :638 (octet & 0xF0) == 0xE0 ? octet & 0x0F :639 (octet & 0xF8) == 0xF0 ? octet & 0x07 : 0;640 if (!width) return 0;641 if (pointer+width > end) return 0;642 for (k = 1; k < width; k ++) {643 octet = pointer[k];644 if ((octet & 0xC0) != 0x80) return 0;645 value = (value << 6) + (octet & 0x3F);646 }647 if (!((width == 1) ||648 (width == 2 && value >= 0x80) ||649 (width == 3 && value >= 0x800) ||650 (width == 4 && value >= 0x10000))) return 0;651 652 pointer += width;653 }654 655 return 1;656 }657 658 /*659 614 * Create STREAM-START. 660 615 */ 661 616 … … 731 686 tag_directive != tag_directives_end; tag_directive ++) { 732 687 assert(tag_directive->handle); 733 688 assert(tag_directive->prefix); 734 if (!yaml_check_utf8(tag_directive->handle,735 strlen((char *)tag_directive->handle)))736 goto error;737 if (!yaml_check_utf8(tag_directive->prefix,738 strlen((char *)tag_directive->prefix)))739 goto error;740 689 value.handle = yaml_strdup(tag_directive->handle); 741 690 value.prefix = yaml_strdup(tag_directive->prefix); 742 691 if (!value.handle || !value.prefix) goto error; … … 796 745 assert(event); /* Non-NULL event object is expected. */ 797 746 assert(anchor); /* Non-NULL anchor is expected. 
*/ 798 747 799 if (!yaml_check_utf8(anchor, strlen((char *)anchor))) return 0;800 801 748 anchor_copy = yaml_strdup(anchor); 802 749 if (!anchor_copy) 803 750 return 0; … … 827 774 assert(value); /* Non-NULL anchor is expected. */ 828 775 829 776 if (anchor) { 830 if (!yaml_check_utf8(anchor, strlen((char *)anchor))) goto error;831 777 anchor_copy = yaml_strdup(anchor); 832 778 if (!anchor_copy) goto error; 833 779 } 834 780 835 781 if (tag) { 836 if (!yaml_check_utf8(tag, strlen((char *)tag))) goto error;837 782 tag_copy = yaml_strdup(tag); 838 783 if (!tag_copy) goto error; 839 784 } … … 842 787 length = strlen((char *)value); 843 788 } 844 789 845 if (!yaml_check_utf8(value, length)) goto error;846 790 value_copy = yaml_malloc(length+1); 847 791 if (!value_copy) goto error; 848 792 memcpy(value_copy, value, length); … … 877 821 assert(event); /* Non-NULL event object is expected. */ 878 822 879 823 if (anchor) { 880 if (!yaml_check_utf8(anchor, strlen((char *)anchor))) goto error;881 824 anchor_copy = yaml_strdup(anchor); 882 825 if (!anchor_copy) goto error; 883 826 } 884 827 885 828 if (tag) { 886 if (!yaml_check_utf8(tag, strlen((char *)tag))) goto error;887 829 tag_copy = yaml_strdup(tag); 888 830 if (!tag_copy) goto error; 889 831 } … … 932 874 assert(event); /* Non-NULL event object is expected. 
*/ 933 875 934 876 if (anchor) { 935 if (!yaml_check_utf8(anchor, strlen((char *)anchor))) goto error;936 877 anchor_copy = yaml_strdup(anchor); 937 878 if (!anchor_copy) goto error; 938 879 } 939 880 940 881 if (tag) { 941 if (!yaml_check_utf8(tag, strlen((char *)tag))) goto error;942 882 tag_copy = yaml_strdup(tag); 943 883 if (!tag_copy) goto error; 944 884 } … … 1072 1012 tag_directive != tag_directives_end; tag_directive ++) { 1073 1013 assert(tag_directive->handle); 1074 1014 assert(tag_directive->prefix); 1075 if (!yaml_check_utf8(tag_directive->handle,1076 strlen((char *)tag_directive->handle)))1077 goto error;1078 if (!yaml_check_utf8(tag_directive->prefix,1079 strlen((char *)tag_directive->prefix)))1080 goto error;1081 1015 value.handle = yaml_strdup(tag_directive->handle); 1082 1016 value.prefix = yaml_strdup(tag_directive->prefix); 1083 1017 if (!value.handle || !value.prefix) goto error; … … 1210 1144 tag = (yaml_char_t *)YAML_DEFAULT_SCALAR_TAG; 1211 1145 } 1212 1146 1213 if (!yaml_check_utf8(tag, strlen((char *)tag))) goto error;1214 1147 tag_copy = yaml_strdup(tag); 1215 1148 if (!tag_copy) goto error; 1216 1149 … … 1218 1151 length = strlen((char *)value); 1219 1152 } 1220 1153 1221 if (!yaml_check_utf8(value, length)) goto error;1222 1154 value_copy = yaml_malloc(length+1); 1223 1155 if (!value_copy) goto error; 1224 1156 memcpy(value_copy, value, length); … … 1262 1194 tag = (yaml_char_t *)YAML_DEFAULT_SEQUENCE_TAG; 1263 1195 } 1264 1196 1265 if (!yaml_check_utf8(tag, strlen((char *)tag))) goto error;1266 1197 tag_copy = yaml_strdup(tag); 1267 1198 if (!tag_copy) goto error; 1268 1199 … … 1307 1238 tag = (yaml_char_t *)YAML_DEFAULT_MAPPING_TAG; 1308 1239 } 1309 1240 1310 if (!yaml_check_utf8(tag, strlen((char *)tag))) goto error;1311 1241 tag_copy = yaml_strdup(tag); 1312 1242 if (!tag_copy) goto error; 1313 1243 -
emitter.c
diff -ur original_src/emitter.c src/emitter.c
old new 37 37 1)) 38 38 39 39 /* 40 * Copy a characterfrom a string into buffer.40 * Copy a byte from a string into buffer. 41 41 */ 42 42 43 43 #define WRITE(emitter,string) \ … … 46 46 emitter->column ++, \ 47 47 1)) 48 48 49 #define WRITEN(emitter,string,n) \ 50 (FLUSH(emitter) \ 51 && (COPYN(emitter->buffer,string,n), \ 52 emitter->column ++, \ 53 1)) 54 49 55 /* 50 56 * Copy a line break character from a string into buffer. 51 57 */ … … 56 62 (PUT_BREAK(emitter), \ 57 63 string.pointer ++, \ 58 64 1) : \ 59 (COPY (emitter->buffer,string),\65 (COPYN(emitter->buffer,string,WIDTH(string)), \ 60 66 emitter->column = 0, \ 61 67 emitter->line ++, \ 62 68 1))) … … 1458 1464 return 1; 1459 1465 } 1460 1466 1467 /** 1468 Return the number of bytes in the next UTF-8 character. 1469 1470 Returns 0 for any error, including incorrect bytes, not enough 1471 bytes before the end pointer, and overlong encodings. If 0 is 1472 returned, pointer[0] will always have the high bit set and will 1473 thus never match any ASCII character. 1474 1475 The encodings of surrogate halves are allowed! Otherwise it is not 1476 possible to losslessly encode invalid UTF-16 into UTF-8. (There is 1477 a vocal contingent trying to sabotage UTF-8 by declaring surrogate 1478 half encodings invalid. Any such claim should be investigated 1479 carefully: if that user's code does not also reject invalid UTF-16, 1480 then they are being hypocrites and can be ignored.) 
1481 */ 1482 static unsigned int utf8_width(yaml_char_t* pointer, yaml_char_t* end) 1483 { 1484 unsigned char octet = pointer[0]; 1485 if (octet < 0x80) { 1486 /* 1-byte character */ 1487 return 1; 1488 } else if (octet < 0xC2) { 1489 /* continuation byte or overlong 2-byte encoding */ 1490 return 0; 1491 } else if (octet < 0xE0) { 1492 /* 2-byte character */ 1493 if (end-pointer < 2) return 0; 1494 if ((pointer[1] & 0xC0) != 0x80) return 0; 1495 return 2; 1496 } else if (octet < 0xF0) { 1497 /* 3-byte character */ 1498 if (end-pointer < 3) return 0; 1499 if (octet == 0xE0 && pointer[1] < 0xA0) return 0; /* overlong */ 1500 if ((pointer[1] & 0xC0) != 0x80) return 0; 1501 if ((pointer[2] & 0xC0) != 0x80) return 0; 1502 return 3; 1503 } else if (octet < 0xF5) { 1504 /* 4-byte character */ 1505 if (end-pointer < 4) return 0; 1506 if (octet == 0xF0 && pointer[1] < 0x90) return 0; /* overlong */ 1507 if (octet == 0xF4 && pointer[1] > 0x8F) return 0; /* > 0x10FFFF */ 1508 if ((pointer[1] & 0xC0) != 0x80) return 0; 1509 if ((pointer[2] & 0xC0) != 0x80) return 0; 1510 if ((pointer[3] & 0xC0) != 0x80) return 0; 1511 return 4; 1512 } else { 1513 /* can never appear in UTF-8 */ 1514 return 0; 1515 } 1516 } 1517 1461 1518 /* 1462 1519 * Check if a scalar is valid. 
1463 1520 */ … … 1481 1538 int space_break = 0; 1482 1539 1483 1540 int preceeded_by_whitespace = 0; 1484 int followed_by_whitespace = 0;1485 1541 int previous_space = 0; 1486 1542 int previous_break = 0; 1487 1543 … … 1510 1566 } 1511 1567 1512 1568 preceeded_by_whitespace = 1; 1513 followed_by_whitespace = IS_BLANKZ_AT(string, WIDTH(string));1514 1569 1515 1570 while (string.pointer != string.end) 1516 1571 { 1572 unsigned int width = utf8_width(string.pointer, string.end); 1573 1574 if (!width) { 1575 special_characters = 1; 1576 string.pointer++; 1577 continue; 1578 } 1579 1517 1580 if (string.start == string.pointer) 1518 1581 { 1519 1582 if (CHECK(string, '#') || CHECK(string, ',') … … 1530 1593 1531 1594 if (CHECK(string, '?') || CHECK(string, ':')) { 1532 1595 flow_indicators = 1; 1533 if ( followed_by_whitespace) {1596 if (IS_BLANKZ_AT(string, 1)) { 1534 1597 block_indicators = 1; 1535 1598 } 1536 1599 } 1537 1600 1538 if (CHECK(string, '-') && followed_by_whitespace) {1601 if (CHECK(string, '-') && IS_BLANKZ_AT(string, 1)) { 1539 1602 flow_indicators = 1; 1540 1603 block_indicators = 1; 1541 1604 } … … 1550 1613 1551 1614 if (CHECK(string, ':')) { 1552 1615 flow_indicators = 1; 1553 if ( followed_by_whitespace) {1616 if (IS_BLANKZ_AT(string, 1)) { 1554 1617 block_indicators = 1; 1555 1618 } 1556 1619 } … … 1566 1629 special_characters = 1; 1567 1630 } 1568 1631 1569 if (IS_BREAK(string)) {1570 line_breaks = 1;1571 }1572 1573 1632 if (IS_SPACE(string)) 1574 1633 { 1575 1634 if (string.start == string.pointer) { … … 1586 1645 } 1587 1646 else if (IS_BREAK(string)) 1588 1647 { 1648 line_breaks = 1; 1589 1649 if (string.start == string.pointer) { 1590 1650 leading_break = 1; 1591 1651 } … … 1605 1665 } 1606 1666 1607 1667 preceeded_by_whitespace = IS_BLANKZ(string); 1608 MOVE(string); 1609 if (string.pointer != string.end) { 1610 followed_by_whitespace = IS_BLANKZ_AT(string, WIDTH(string)); 1611 } 1668 MOVEN(string, width); 1612 1669 } 1613 1670 1614 1671 
emitter->scalar_data.multiline = line_breaks; … … 1851 1908 if (!WRITE(emitter, string)) return 0; 1852 1909 } 1853 1910 else { 1854 int width = WIDTH(string); 1855 unsigned int value; 1856 while (width --) { 1857 value = *(string.pointer++); 1858 if (!PUT(emitter, '%')) return 0; 1859 if (!PUT(emitter, (value >> 4) 1860 + ((value >> 4) < 10 ? '0' : 'A' - 10))) 1861 return 0; 1862 if (!PUT(emitter, (value & 0x0F) 1863 + ((value & 0x0F) < 10 ? '0' : 'A' - 10))) 1864 return 0; 1865 } 1911 /* %-encode all other bytes, including valid and invalid UTF-8 */ 1912 unsigned char value = *(string.pointer++); 1913 if (!PUT(emitter, '%')) return 0; 1914 if (!PUT(emitter, (value >> 4) 1915 + ((value >> 4) < 10 ? '0' : 'A' - 10))) 1916 return 0; 1917 if (!PUT(emitter, (value & 0x0F) 1918 + ((value & 0x0F) < 10 ? '0' : 'A' - 10))) 1919 return 0; 1866 1920 } 1867 1921 } 1868 1922 … … 1991 2045 return 1; 1992 2046 } 1993 2047 2048 static unsigned char utf8_mask[5] = {0xFF, 0x7F, 0x1F, 0x0F, 0x07}; 2049 1994 2050 static int 1995 2051 yaml_emitter_write_double_quoted_scalar(yaml_emitter_t *emitter, 1996 2052 yaml_char_t *value, size_t length, int allow_breaks) … … 2003 2059 2004 2060 while (string.pointer != string.end) 2005 2061 { 2062 unsigned int width = utf8_width(string.pointer, string.end); 2063 2064 if (width == 0) { 2065 /* 2066 UTF-8 encoding error. This is a byte with the high bit 2067 set. The parser has been altered to read \XNN as a raw 2068 byte. I would prefer to use lowercase x but the old 2069 writer produces that for legal UTF-8 encodings of 2070 U+0080..U+00FF. It is not clear what other parsers will 2071 do with this, though previous libyaml threw an error. 2072 */ 2073 unsigned int value = string.pointer[0]; 2074 int digit; 2075 if (!PUT(emitter, '\\')) return 0; 2076 if (!PUT(emitter, 'X')) return 0; 2077 digit = (value >> 4) & 0x0F; 2078 if (!PUT(emitter, digit + (digit < 10 ? 
'0' : 'A'-10))) return 0; 2079 digit = value & 0x0F; 2080 if (!PUT(emitter, digit + (digit < 10 ? '0' : 'A'-10))) return 0; 2081 MOVE(string); 2082 spaces = 0; 2083 continue; 2084 } 2085 2006 2086 if (!IS_PRINTABLE(string) || (!emitter->unicode && !IS_ASCII(string)) 2007 2087 || IS_BOM(string) || IS_BREAK(string) 2008 2088 || CHECK(string, '"') || CHECK(string, '\\')) 2009 2089 { 2010 unsigned char octet;2011 unsigned int width;2012 2090 unsigned int value; 2013 2091 int k; 2014 2092 2015 octet = string.pointer[0]; 2016 width = (octet & 0x80) == 0x00 ? 1 : 2017 (octet & 0xE0) == 0xC0 ? 2 : 2018 (octet & 0xF0) == 0xE0 ? 3 : 2019 (octet & 0xF8) == 0xF0 ? 4 : 0; 2020 value = (octet & 0x80) == 0x00 ? octet & 0x7F : 2021 (octet & 0xE0) == 0xC0 ? octet & 0x1F : 2022 (octet & 0xF0) == 0xE0 ? octet & 0x0F : 2023 (octet & 0xF8) == 0xF0 ? octet & 0x07 : 0; 2093 value = string.pointer[0] & utf8_mask[width]; 2024 2094 for (k = 1; k < (int)width; k ++) { 2025 octet = string.pointer[k]; 2026 value = (value << 6) + (octet & 0x3F); 2095 value = (value << 6) + (string.pointer[k] & 0x3F); 2027 2096 } 2028 2097 string.pointer += width; 2029 2098 … … 2092 2161 break; 2093 2162 2094 2163 default: 2095 if (value <= 0xFF) { 2164 // \xNN sequence is disabled so that it may be used in the 2165 // future for invalid byte sequences 2166 /*if (value <= 0xFF) { 2096 2167 if (!PUT(emitter, 'x')) return 0; 2097 2168 width = 2; 2098 2169 } 2099 else if (value <= 0xFFFF) {2170 else*/ if (value <= 0xFFFF) { 2100 2171 if (!PUT(emitter, 'u')) return 0; 2101 2172 width = 4; 2102 2173 } … … 2131 2202 } 2132 2203 else 2133 2204 { 2134 if (!WRITE (emitter, string)) return 0;2205 if (!WRITEN(emitter, string, width)) return 0; 2135 2206 spaces = 0; 2136 2207 } 2137 2208 } -
reader.c
diff -ur original_src/reader.c src/reader.c
old new 187 187 while (parser->raw_buffer.pointer != parser->raw_buffer.last) 188 188 { 189 189 unsigned int value = 0, value2 = 0; 190 int incomplete = 0;191 190 unsigned char octet; 192 191 unsigned int width = 0; 193 192 int low, high; … … 199 198 switch (parser->encoding) 200 199 { 201 200 case YAML_UTF8_ENCODING: 202 203 /*204 * Decode a UTF-8 character. Check RFC 3629205 * (http://www.ietf.org/rfc/rfc3629.txt) for more details.206 *207 * The following table (taken from the RFC) is used for208 * decoding.209 *210 * Char. number range | UTF-8 octet sequence211 * (hexadecimal) | (binary)212 * --------------------+------------------------------------213 * 0000 0000-0000 007F | 0xxxxxxx214 * 0000 0080-0000 07FF | 110xxxxx 10xxxxxx215 * 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx216 * 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx217 *218 * Additionally, the characters in the range 0xD800-0xDFFF219 * are prohibited as they are reserved for use with UTF-16220 * surrogate pairs.221 */222 223 /* Determine the length of the UTF-8 sequence. */224 225 201 octet = parser->raw_buffer.pointer[0]; 226 width = (octet & 0x80) == 0x00 ? 1 : 227 (octet & 0xE0) == 0xC0 ? 2 : 228 (octet & 0xF0) == 0xE0 ? 3 : 229 (octet & 0xF8) == 0xF0 ? 4 : 0; 230 231 /* Check if the leading octet is valid. */ 232 233 if (!width) 234 return yaml_parser_set_reader_error(parser, 235 "Invalid leading UTF-8 octet", 236 parser->offset, octet); 237 238 /* Check if the raw buffer contains an incomplete character. */ 239 240 if (width > raw_unread) { 241 if (parser->eof) { 242 return yaml_parser_set_reader_error(parser, 243 "Incomplete UTF-8 octet sequence", 244 parser->offset, -1); 245 } 246 incomplete = 1; 247 break; 248 } 249 250 /* Decode the leading octet. */ 251 252 value = (octet & 0x80) == 0x00 ? octet & 0x7F : 253 (octet & 0xE0) == 0xC0 ? octet & 0x1F : 254 (octet & 0xF0) == 0xE0 ? octet & 0x0F : 255 (octet & 0xF8) == 0xF0 ? 
octet & 0x07 : 0; 256 257 /* Check and decode the trailing octets. */ 258 259 for (k = 1; k < width; k ++) 260 { 261 octet = parser->raw_buffer.pointer[k]; 262 263 /* Check if the octet is valid. */ 264 265 if ((octet & 0xC0) != 0x80) 266 return yaml_parser_set_reader_error(parser, 267 "Invalid trailing UTF-8 octet", 268 parser->offset+k, octet); 269 270 /* Decode the octet. */ 271 272 value = (value << 6) + (octet & 0x3F); 273 } 274 275 /* Check the length of the sequence against the value. */ 276 277 if (!((width == 1) || 278 (width == 2 && value >= 0x80) || 279 (width == 3 && value >= 0x800) || 280 (width == 4 && value >= 0x10000))) 281 return yaml_parser_set_reader_error(parser, 282 "Invalid length of a UTF-8 sequence", 283 parser->offset, -1); 284 285 /* Check the range of the value. */ 286 287 if ((value >= 0xD800 && value <= 0xDFFF) || value > 0x10FFFF) 202 /* 203 We can only disallow characters without the high 204 bit set. Characters with the high bit set are required 205 for invalid UTF-8 strings to be encoded, as we 206 cannot rely on any backslash sequences working. 207 */ 208 if (! (octet == 0x09 || octet == 0x0A || octet == 0x0D 209 || (octet >= 0x20 && octet != 0x7F) )) 288 210 return yaml_parser_set_reader_error(parser, 289 " Invalid Unicode character",211 "Control characters are not allowed", 290 212 parser->offset, value); 291 213 214 parser->raw_buffer.pointer++; 215 parser->offset++; 216 *(parser->buffer.last++) = octet; 292 217 break; 293 218 294 219 case YAML_UTF16LE_ENCODING: … … 331 256 "Incomplete UTF-16 character", 332 257 parser->offset, -1); 333 258 } 334 incomplete = 1;335 259 break; 336 260 } 337 261 … … 339 263 340 264 value = parser->raw_buffer.pointer[low] 341 265 + (parser->raw_buffer.pointer[high] << 8); 342 343 /* Check for unexpected low surrogate area. 
*/ 344 345 if ((value & 0xFC00) == 0xDC00) 346 return yaml_parser_set_reader_error(parser, 347 "Unexpected low surrogate area", 348 parser->offset, value); 266 width = 2; 349 267 350 268 /* Check for a high surrogate area. */ 351 269 352 270 if ((value & 0xFC00) == 0xD800) { 353 271 354 width = 4;355 356 272 /* Check for incomplete surrogate pair. */ 357 273 358 274 if (raw_unread < 4) { 359 if (parser->eof) { 360 return yaml_parser_set_reader_error(parser, 361 "Incomplete UTF-16 surrogate pair", 362 parser->offset, -1); 275 if (parser->eof) { /* trailing high surrogate */ 276 width = 2; 277 } else { 278 break; /* Can't tell until we have more raw characters */ 279 } 280 } else { 281 282 /* Get the next character. */ 283 284 value2 = parser->raw_buffer.pointer[low+2] 285 + (parser->raw_buffer.pointer[high+2] << 8); 286 287 /* Check for a low surrogate area. */ 288 if ((value2 & 0xFC00) == 0xDC00) { 289 width = 4; 290 /* Generate the value of the surrogate pair. */ 291 value = 0x10000 + ((value & 0x3FF) << 10) + (value2 & 0x3FF); 363 292 } 364 incomplete = 1;365 break;366 293 } 294 } 367 295 368 /* Get the nextcharacter. */296 /* Check if the raw buffer contains enough bytes to form a character. */ 369 297 370 value2 = parser->raw_buffer.pointer[low+2] 371 + (parser->raw_buffer.pointer[high+2] << 8); 298 /* 299 * Check if the character is in the allowed range: 300 * #x9 | #xA | #xD | [#x20-#x7E] (8 bit) 301 * | #x85 | [#xA0-#xD7FF] | [#xE000-#xFFFD] (16 bit) 302 * | [#x10000-#x10FFFF] (32 bit) 303 */ 304 /* Modified to allow all 16-bit values as \uNNNN may not 305 work for some parsers for these values. */ 306 if (! (value == 0x09 || value == 0x0A || value == 0x0D 307 || (value >= 0x20 && value <= 0x7E) 308 || (value == 0x85) || value >= 0xA0)) 309 return yaml_parser_set_reader_error(parser, 310 "Control characters are not allowed", 311 parser->offset, value); 372 312 373 /* Check for a low surrogate area. */313 /* Move the raw pointers. 
*/ 374 314 375 if ((value2 & 0xFC00) != 0xDC00) 376 return yaml_parser_set_reader_error(parser, 377 "Expected low surrogate area", 378 parser->offset+2, value2); 315 parser->raw_buffer.pointer += width; 316 parser->offset += width; 379 317 380 /* Generate the value of the surrogate pair. */318 /* Finally put the character into the buffer. */ 381 319 382 value = 0x10000 + ((value & 0x3FF) << 10) + (value2 & 0x3FF); 320 /* 0000 0000-0000 007F -> 0xxxxxxx */ 321 if (value <= 0x7F) { 322 *(parser->buffer.last++) = value; 383 323 } 384 324 /* 0000 0080-0000 07FF -> 110xxxxx 10xxxxxx */ 325 else if (value <= 0x7FF) { 326 *(parser->buffer.last++) = 0xC0 + (value >> 6); 327 *(parser->buffer.last++) = 0x80 + (value & 0x3F); 328 } 329 /* 0000 0800-0000 FFFF -> 1110xxxx 10xxxxxx 10xxxxxx */ 330 else if (value <= 0xFFFF) { 331 *(parser->buffer.last++) = 0xE0 + (value >> 12); 332 *(parser->buffer.last++) = 0x80 + ((value >> 6) & 0x3F); 333 *(parser->buffer.last++) = 0x80 + (value & 0x3F); 334 } 335 /* 0001 0000-0010 FFFF -> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ 385 336 else { 386 width = 2; 337 *(parser->buffer.last++) = 0xF0 + (value >> 18); 338 *(parser->buffer.last++) = 0x80 + ((value >> 12) & 0x3F); 339 *(parser->buffer.last++) = 0x80 + ((value >> 6) & 0x3F); 340 *(parser->buffer.last++) = 0x80 + (value & 0x3F); 387 341 } 388 389 342 break; 390 343 391 344 default: 392 345 assert(1); /* Impossible. */ 393 346 } 394 347 395 /* Check if the raw buffer contains enough bytes to form a character. */396 397 if (incomplete) break;398 399 /*400 * Check if the character is in the allowed range:401 * #x9 | #xA | #xD | [#x20-#x7E] (8 bit)402 * | #x85 | [#xA0-#xD7FF] | [#xE000-#xFFFD] (16 bit)403 * | [#x10000-#x10FFFF] (32 bit)404 */405 406 if (! 
(value == 0x09 || value == 0x0A || value == 0x0D407 || (value >= 0x20 && value <= 0x7E)408 || (value == 0x85) || (value >= 0xA0 && value <= 0xD7FF)409 || (value >= 0xE000 && value <= 0xFFFD)410 || (value >= 0x10000 && value <= 0x10FFFF)))411 return yaml_parser_set_reader_error(parser,412 "Control characters are not allowed",413 parser->offset, value);414 415 /* Move the raw pointers. */416 417 parser->raw_buffer.pointer += width;418 parser->offset += width;419 420 /* Finally put the character into the buffer. */421 422 /* 0000 0000-0000 007F -> 0xxxxxxx */423 if (value <= 0x7F) {424 *(parser->buffer.last++) = value;425 }426 /* 0000 0080-0000 07FF -> 110xxxxx 10xxxxxx */427 else if (value <= 0x7FF) {428 *(parser->buffer.last++) = 0xC0 + (value >> 6);429 *(parser->buffer.last++) = 0x80 + (value & 0x3F);430 }431 /* 0000 0800-0000 FFFF -> 1110xxxx 10xxxxxx 10xxxxxx */432 else if (value <= 0xFFFF) {433 *(parser->buffer.last++) = 0xE0 + (value >> 12);434 *(parser->buffer.last++) = 0x80 + ((value >> 6) & 0x3F);435 *(parser->buffer.last++) = 0x80 + (value & 0x3F);436 }437 /* 0001 0000-0010 FFFF -> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */438 else {439 *(parser->buffer.last++) = 0xF0 + (value >> 18);440 *(parser->buffer.last++) = 0x80 + ((value >> 12) & 0x3F);441 *(parser->buffer.last++) = 0x80 + ((value >> 6) & 0x3F);442 *(parser->buffer.last++) = 0x80 + (value & 0x3F);443 }444 445 348 parser->unread ++; 446 349 } 447 350 -
scanner.c
diff -ur original_src/scanner.c src/scanner.c
old new 495 495 (parser->mark.index ++, \ 496 496 parser->mark.column ++, \ 497 497 parser->unread --, \ 498 parser->buffer.pointer + = WIDTH(parser->buffer))498 parser->buffer.pointer ++) 499 499 500 #define SKIPN(parser,n) \ 501 (parser->mark.index ++, \ 502 parser->mark.column ++, \ 503 parser->unread --, \ 504 parser->buffer.pointer += (n)) 500 505 #define SKIP_LINE(parser) \ 501 506 (IS_CRLF(parser->buffer) ? \ 502 507 (parser->mark.index += 2, \ … … 523 528 parser->unread --, \ 524 529 1) : 0) 525 530 531 #define READN(parser,string,n) \ 532 (STRING_EXTEND(parser,string) ? \ 533 (COPYN(string,parser->buffer,n), \ 534 parser->mark.index ++, \ 535 parser->mark.column ++, \ 536 parser->unread --, \ 537 1) : 0) 538 526 539 /* 527 540 * Copy a line break character to a string buffer and advance pointers. 528 541 */ … … 1924 1937 if (!CACHE(parser, 1)) return 0; 1925 1938 1926 1939 if (parser->mark.column == 0 && IS_BOM(parser->buffer)) 1927 SKIP (parser);1940 SKIPN(parser,3); /* UTF-8 BOM is 3 bytes */ 1928 1941 1929 1942 /* 1930 1943 * Eat whitespaces. … … 2681 2694 2682 2695 octet = (AS_HEX_AT(parser->buffer, 1) << 4) + AS_HEX_AT(parser->buffer, 2); 2683 2696 2684 /* If it is the leading octet, determine the length of the UTF-8 sequence. */2685 2686 if (!width)2687 {2688 width = (octet & 0x80) == 0x00 ? 1 :2689 (octet & 0xE0) == 0xC0 ? 2 :2690 (octet & 0xF0) == 0xE0 ? 3 :2691 (octet & 0xF8) == 0xF0 ? 4 : 0;2692 if (!width) {2693 return yaml_parser_set_scanner_error(parser, directive ?2694 "while parsing a %TAG directive" : "while parsing a tag",2695 start_mark, "found an incorrect leading UTF-8 octet");2696 }2697 }2698 else2699 {2700 /* Check if the trailing octet is correct. */2701 2702 if ((octet & 0xC0) != 0x80) {2703 return yaml_parser_set_scanner_error(parser, directive ?2704 "while parsing a %TAG directive" : "while parsing a tag",2705 start_mark, "found an incorrect trailing UTF-8 octet");2706 }2707 }2708 2709 2697 /* Copy the octet and move the pointers. 
*/ 2710 2698 2711 2699 *(string->pointer++) = octet; … … 3102 3090 else if (!single && CHECK(parser->buffer, '\\')) 3103 3091 { 3104 3092 size_t code_length = 0; 3093 int raw_code = 0; 3105 3094 3106 3095 if (!STRING_EXTEND(parser, string)) goto error; 3107 3096 … … 3188 3177 code_length = 2; 3189 3178 break; 3190 3179 3180 case 'X': 3181 code_length = 2; 3182 raw_code = 1; 3183 break; 3184 3191 3185 case 'u': 3192 3186 code_length = 4; 3193 3187 break; … … 3233 3227 goto error; 3234 3228 } 3235 3229 3236 if (value <= 0x7F ) {3230 if (value <= 0x7F || raw_code) { 3237 3231 *(string.pointer++) = value; 3238 3232 } 3239 3233 else if (value <= 0x7FF) { … … 3262 3256 3263 3257 else 3264 3258 { 3265 /* It is a non-escaped non-blank character. */3259 /* It is a non-escaped non-blank byte. */ 3266 3260 3267 3261 if (!READ(parser, string)) goto error; 3268 3262 } … … 3474 3468 } 3475 3469 } 3476 3470 3477 /* Copy the character. */3471 /* Copy the byte. */ 3478 3472 3479 3473 if (!READ(parser, string)) goto error; 3480 3474 -
yaml_private.h
diff -ur original_src/yaml_private.h src/yaml_private.h
old new 25 25 yaml_strdup(const yaml_char_t *); 26 26 27 27 /* 28 * Reader: Ensure that the buffer contains at least `length` characters.28 * Reader: Ensure that the buffer contains at least `length` bytes. 29 29 */ 30 30 31 31 YAML_DECLARE(int) … … 237 237 238 238 /* 239 239 * Check if the character can be printed unescaped. 240 * Only correct if you know you are looking at valid UTF-8! 240 241 */ 241 242 242 243 #define IS_PRINTABLE_AT(string,offset) \ … … 355 356 356 357 /* 357 358 * Determine the width of the character. 359 * Only correct if you know you are looking at valid UTF-8! 358 360 */ 359 361 360 362 #define WIDTH_AT(string,offset) \ … … 366 368 #define WIDTH(string) WIDTH_AT((string),0) 367 369 368 370 /* 369 * Move the string pointer to the next character.371 * Move the string pointer to the next byte. 370 372 */ 371 373 372 #define MOVE(string) ((string).pointer += WIDTH((string))) 374 #define MOVE(string) ((string).pointer++) 375 376 #define MOVEN(string,n) ((string).pointer += n) 373 377 374 378 /* 375 * Copy a characterand move the pointers of both strings.379 * Copy a byte and move the pointers of both strings. 376 380 */ 377 381 378 382 #define COPY(string_a,string_b) \ 379 ((*(string_b).pointer & 0x80) == 0x00 ? \ 380 (*((string_a).pointer++) = *((string_b).pointer++)) : \ 381 (*(string_b).pointer & 0xE0) == 0xC0 ? \ 382 (*((string_a).pointer++) = *((string_b).pointer++), \ 383 *((string_a).pointer++) = *((string_b).pointer++)) : \ 384 (*(string_b).pointer & 0xF0) == 0xE0 ? \ 385 (*((string_a).pointer++) = *((string_b).pointer++), \ 386 *((string_a).pointer++) = *((string_b).pointer++), \ 387 *((string_a).pointer++) = *((string_b).pointer++)) : \ 388 (*(string_b).pointer & 0xF8) == 0xF0 ? 
\ 389 (*((string_a).pointer++) = *((string_b).pointer++), \ 390 *((string_a).pointer++) = *((string_b).pointer++), \ 391 *((string_a).pointer++) = *((string_b).pointer++), \ 392 *((string_a).pointer++) = *((string_b).pointer++)) : 0) 383 (*((string_a).pointer++) = *((string_b).pointer++)) 384 385 #define COPYN(string_a,string_b,n) \ 386 (memcpy(string_a.pointer, string_b.pointer, n), \ 387 string_a.pointer += n, string_b.pointer += n) 393 388 394 389 /* 395 390 * Stack and queue management.
