Changeset 181
- Timestamp:
- 05/30/06 13:49:18 (2 years ago)
- Files:
-
- libyaml/trunk/include/yaml/yaml.h (modified) (3 diffs)
- libyaml/trunk/src/reader.c (modified) (9 diffs)
- libyaml/trunk/tests/Makefile.am (modified) (1 diff)
- libyaml/trunk/tests/test-reader.c (copied) (copied from libyaml/trunk/tests/test-version.c) (1 diff)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
libyaml/trunk/include/yaml/yaml.h
r180 r181 292 292 */ 293 293 294 /** Error type. */ 294 295 yaml_error_type_t error; 296 297 /** Error description. */ 298 const char *problem; 299 300 /** The byte about which the problem occurred. */ 301 size_t problem_offset; 295 302 296 303 /** … … 303 310 */ 304 311 305 /** Read handler */312 /** Read handler. */ 306 313 yaml_read_handler_t *read_handler; 307 314 … … 489 496 #define YAML_BUFFER_SIZE (YAML_RAW_BUFFER_SIZE*3) 490 497 498 /** 499 * Ensure that the buffer contains at least @a length characters. 500 * 501 * @param[in] parser A parser object. 502 * @param[in] length The number of characters in the buffer. 503 * 504 * @returns @c 1 on success, @c 0 on error. 505 */ 506 507 int 508 yaml_parser_update_buffer(yaml_parser_t *parser, size_t length); 509 491 510 /** @} */ 492 511 libyaml/trunk/src/reader.c
r180 r181 8 8 #include <assert.h> 9 9 10 /* Check for the UTF-16-BE BOM. */11 #define IS_UTF16BE_BOM(pointer) ((pointer)[0] == 0xFE && (pointer)[1] == 0xFF)12 13 /* Check for the UTF-16-LE BOM. */14 #define IS_UTF16LE_BOM(pointer) ((pointer)[0] == 0xFF && (pointer)[1] == 0xFE)15 16 /* Get a UTF-16-BE character. */17 #define UTF16BE_CHAR(pointer) ((pointer)[0] << 8 + (pointer)[1])18 19 /* Get a UTF-16-LE character. */20 #define UTF16LE_CHAR(pointer) ((pointer)[0] + (pointer)[1] << 8)21 22 10 /* 23 * From http://www.ietf.org/rfc/rfc3629.txt: 24 * 25 * Char. number range | UTF-8 octet sequence 26 * (hexadecimal) | (binary) 27 * --------------------+--------------------------------------------- 28 * 0000 0000-0000 007F | 0xxxxxxx 29 * 0000 0080-0000 07FF | 110xxxxx 10xxxxxx 30 * 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx 31 * 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 11 * Set the reader error and return 0. 32 12 */ 33 13 34 /* Get the length of a UTF-8 character (0 on error). */ 35 #define UTF8_LENGTH(pointer) \ 36 ((pointer)[0] < 0x80 ? 1 : \ 37 (pointer)[0] < 0xC0 ? 0 : \ 38 (pointer)[0] < 0xE0 ? 2 : \ 39 (pointer)[0] < 0xF0 ? 3 : \ 40 (pointer)[0] < 0xF8 ? 4 : 0) 41 42 /* Get the value of the first byte of a UTF-8 sequence (0xFF on error). */ 43 #define UTF8_FIRST_CHUNK(pointer) \ 44 ((pointer)[0] < 0x80 ? (pointer)[0] & 0x7F : \ 45 (pointer)[0] < 0xC0 ? 0xFF : \ 46 (pointer)[0] < 0xE0 ? (pointer)[0] & 0x1F : \ 47 (pointer)[0] < 0xF0 ? (pointer)[0] & 0x0F : \ 48 (pointer)[0] < 0xF8 ? (pointer)[0] & 0x07 : 0xFF) 49 50 /* Get the value of a non-first byte of a UTF-8 sequence (0xFF on error). */ 51 #define UTF8_NEXT_CHUNK(pointer) \ 52 ((pointer)[0] >= 0x80 && (pointer)[0] < 0xC0 ? (pointer)[0] & 0x3F : 0xFF) 53 54 /* Determine the length of a UTF-8 character. 
*/ 14 int 15 yaml_parser_set_reader_error(yaml_parser_t *parser, const char *problem) 16 { 17 parser->error = YAML_READER_ERROR; 18 parser->problem = problem; 19 parser->problem_offset = parser->offset; 20 21 return 0; 22 } 23 55 24 56 25 /* … … 103 72 if (!yaml_parser_update_raw_buffer(parser)) return 0; 104 73 74 /* If the raw buffer is empty, it is EOF. */ 75 76 if (!parser->raw_unread) return 1; 77 105 78 /* Decode the raw buffer. */ 106 79 107 80 while (parser->raw_unread) 108 81 { 109 unsigned int ch;82 unsigned int value, value2; 110 83 int incomplete = 0; 84 unsigned char utf8_octet; 85 unsigned int utf8_length; 86 int k, low, high; 111 87 112 88 /* Decode the next character. */ … … 116 92 case YAML_UTF8_ENCODING: 117 93 118 unsigned int utf8_length = UTF8_LENGTH(parser->raw_pointer); 119 unsigned int utf8_chunk; 94 /* 95 * Decode a UTF-8 character. Check RFC 3629 96 * (http://www.ietf.org/rfc/rfc3629.txt) for more details. 97 * 98 * The following table (taken from the RFC) is used for 99 * decoding. 100 * 101 * Char. number range | UTF-8 octet sequence 102 * (hexadecimal) | (binary) 103 * --------------------+------------------------------------ 104 * 0000 0000-0000 007F | 0xxxxxxx 105 * 0000 0080-0000 07FF | 110xxxxx 10xxxxxx 106 * 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx 107 * 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 108 * 109 * Additionally, the characters in the range 0xD800-0xDFFF 110 * are prohibited as they are reserved for use with UTF-16 111 * surrogate pairs. 112 */ 113 114 /* Determine the length of the UTF-8 sequence. */ 115 116 utf8_octet = parser->raw_pointer[0]; 117 utf8_length = ( 118 (utf8_octet & 0x80) == 0x00 ? 1 : 119 (utf8_octet & 0xE0) == 0xC0 ? 2 : 120 (utf8_octet & 0xF0) == 0xE0 ? 3 : 121 (utf8_octet & 0xF8) == 0xF0 ? 4 : 0); 122 123 /* Check if the leading octet is valid. 
*/ 124 125 if (!utf8_length) 126 return yaml_parser_set_reader_error(parser, 127 "Invalid leading UTF-8 octet"); 120 128 121 129 /* Check if the raw buffer contains an incomplete character. */ … … 123 131 if (utf8_length > parser->raw_unread) { 124 132 if (parser->eof) { 125 parser->error = YAML_READER_ERROR;126 return 0;133 return yaml_parser_set_reader_error(parser, 134 "Incomplete UTF-8 octet sequence"); 127 135 } 128 136 incomplete = 1; 137 break; 129 138 } 130 139 131 /* Get the character checking it for validity. */ 132 133 utf8_chunk = UTF8_FIRST_CHUNK(parser->raw_pointer ++); 134 if (utf8_chunk == 0xFF) { 135 parser->error = YAML_READER_ERROR; 136 return 0; 140 /* Decode the leading octet. */ 141 142 value = ( 143 (utf8_octet & 0x80) == 0x00 ? utf8_octet & 0x7F : 144 (utf8_octet & 0xE0) == 0xC0 ? utf8_octet & 0x1F : 145 (utf8_octet & 0xF0) == 0xE0 ? utf8_octet & 0x0F : 146 (utf8_octet & 0xF8) == 0xF0 ? utf8_octet & 0x07 : 0); 147 148 /* Check and decode the trailing octets. */ 149 150 for (k = 1; k < utf8_length; k ++) 151 { 152 utf8_octet = parser->raw_pointer[k]; 153 154 /* Check if the octet is valid. */ 155 156 if ((utf8_octet & 0xC0) != 0x80) 157 return yaml_parser_set_reader_error(parser, 158 "Invalid trailing UTF-8 octet"); 159 160 /* Decode the octet. */ 161 162 value = (value << 6) + (utf8_octet & 0x3F); 137 163 } 138 ch = utf8_chunk; 139 parser->raw_unread --; 140 while (-- utf8_length) { 141 utf8_chunk = UTF8_NEXT_CHUNK(parser->raw_pointer ++); 142 if (utf8_chunk == 0xFF) { 143 parser->error = YAML_READER_ERROR; 144 return 0; 145 } 146 ch = ch << 6 + utf8_chunk; 147 parser->raw_unread --; 148 } 164 165 /* Check the length of the sequence against the value. 
*/ 166 167 if (!((utf8_length == 1) || 168 (utf8_length == 2 && value >= 0x80) || 169 (utf8_length == 3 && value >= 0x800) || 170 (utf8_length == 4 && value >= 0x10000))) 171 return yaml_parser_set_reader_error(parser, 172 "Invalid length of a UTF-8 sequence"); 173 174 /* Check the range of the value. */ 175 176 if ((value >= 0xD800 && value <= 0xDFFF) || value > 0x10FFFF) 177 return yaml_parser_set_reader_error(parser, 178 "Invalid Unicode character"); 179 180 parser->raw_pointer += utf8_length; 181 parser->raw_unread -= utf8_length; 182 parser->offset += utf8_length; 149 183 150 184 break; 151 185 152 186 case YAML_UTF16LE_ENCODING: 153 154 /* Check if the raw buffer contains an incomplete character. */ 187 case YAML_UTF16BE_ENCODING: 188 189 low = (parser->encoding == YAML_UTF16LE_ENCODING ? 0 : 1); 190 high = (parser->encoding == YAML_UTF16LE_ENCODING ? 1 : 0); 191 192 /* 193 * The UTF-16 encoding is not as simple as one might 194 * naively think. Check RFC 2781 195 * (http://www.ietf.org/rfc/rfc2781.txt). 196 * 197 * Normally, two subsequent bytes describe a Unicode 198 * character. However a special technique (called a 199 * surrogate pair) is used for specifying character 200 * values larger than 0xFFFF. 201 * 202 * A surrogate pair consists of two pseudo-characters: 203 * high surrogate area (0xD800-0xDBFF) 204 * low surrogate area (0xDC00-0xDFFF) 205 * 206 * The following formulas are used for decoding 207 * and encoding characters using surrogate pairs: 208 * 209 * U = U' + 0x10000 (0x01 00 00 <= U <= 0x10 FF FF) 210 * U' = yyyyyyyyyyxxxxxxxxxx (0 <= U' <= 0x0F FF FF) 211 * W1 = 110110yyyyyyyyyy 212 * W2 = 110111xxxxxxxxxx 213 * 214 * where U is the character value, W1 is the high surrogate 215 * area, W2 is the low surrogate area. 216 */ 217 218 /* Check for incomplete UTF-16 character. 
*/ 155 219 156 220 if (parser->raw_unread < 2) { 157 221 if (parser->eof) { 158 parser->error = YAML_READER_ERROR;159 return 0;222 return yaml_parser_set_reader_error(parser, 223 "Incomplete UTF-16 character"); 160 224 } 161 225 incomplete = 1; 226 break; 162 227 } 163 228 164 /* Get the current character. */ 165 166 ch = UTF16LE_CHAR(parser->raw_pointer); 167 parser->raw_pointer += 2; 168 parser->raw_unread -= 2; 169 170 break; 171 172 case YAML_UTF16BE_ENCODING: 173 174 /* Check if the raw buffer contains an incomplete character. */ 175 176 if (parser->raw_unread < 2) { 177 if (parser->eof) { 178 parser->error = YAML_READER_ERROR; 179 return 0; 229 /* Get the character. */ 230 231 value = parser->raw_pointer[low] 232 + (parser->raw_pointer[high] << 8); 233 234 /* Check for unexpected low surrogate area. */ 235 236 if ((value & 0xFC00) == 0xDC00) 237 return yaml_parser_set_reader_error(parser, 238 "Unexpected low surrogate area"); 239 240 /* Check for a high surrogate area. */ 241 242 if ((value & 0xFC00) == 0xD800) { 243 244 /* Check for incomplete surrogate pair. */ 245 246 if (parser->raw_unread < 4) { 247 if (parser->eof) { 248 return yaml_parser_set_reader_error(parser, 249 "Incomplete UTF-16 surrogate pair"); 250 } 251 incomplete = 1; 252 break; 180 253 } 181 incomplete = 1; 254 255 /* Get the next character. */ 256 257 unsigned int value2 = parser->raw_pointer[low+2] 258 + (parser->raw_pointer[high+2] << 8); 259 260 /* Check for a low surrogate area. */ 261 262 if ((value2 & 0xFC00) != 0xDC00) 263 return yaml_parser_set_reader_error(parser, 264 "Expected low surrogate area"); 265 266 /* Generate the value of the surrogate pair. */ 267 268 value = 0x10000 + ((value & 0x3FF) << 10) + (value2 & 0x3FF); 269 270 parser->raw_pointer += 4; 271 parser->raw_unread -= 4; 272 parser->offset += 4; 182 273 } 183 274 184 /* Get the current character. 
*/185 186 ch = UTF16BE_CHAR(parser->raw_pointer);187 parser->raw_pointer += 2;188 parser->raw_unread -= 2;275 else { 276 parser->raw_pointer += 2; 277 parser->raw_unread -= 2; 278 parser->offset += 4; 279 } 189 280 190 281 break; 191 282 } 283 284 /* Check if the raw buffer contains enough bytes to form a character. */ 285 286 if (incomplete) break; 192 287 193 288 /* … … 198 293 */ 199 294 200 if (! (ch == 0x09 || ch == 0x0A || ch == 0x0D 201 || (ch >= 0x20 && ch <= 0x7E) 202 || (ch == 0x85) || (ch >= 0xA0 && ch <= 0xD7FF) 203 || (ch >= 0xE000 && ch <= 0xFFFD) 204 || (ch >= 0x10000 && ch <= 0x10FFFF))) { 205 parser->error = YAML_READER_ERROR; 206 return 0; 207 } 295 if (! (value == 0x09 || value == 0x0A || value == 0x0D 296 || (value >= 0x20 && value <= 0x7E) 297 || (value == 0x85) || (value >= 0xA0 && value <= 0xD7FF) 298 || (value >= 0xE000 && value <= 0xFFFD) 299 || (value >= 0x10000 && value <= 0x10FFFF))) 300 return yaml_parser_set_reader_error(parser, 301 "Control characters are not allowed"); 208 302 209 303 /* Finally put the character into the buffer. 
*/ 210 304 211 305 /* 0000 0000-0000 007F -> 0xxxxxxx */ 212 if ( ch<= 0x7F) {213 *(parser->buffer_end++) = ch;306 if (value <= 0x7F) { 307 *(parser->buffer_end++) = value; 214 308 } 215 309 /* 0000 0080-0000 07FF -> 110xxxxx 10xxxxxx */ 216 else if ( ch<= 0x7FF) {217 *(parser->buffer_end++) = 0xC0 + ( ch >> 6) & 0x1F;218 *(parser->buffer_end++) = 0x80 + ch& 0x3F;310 else if (value <= 0x7FF) { 311 *(parser->buffer_end++) = 0xC0 + (value >> 6); 312 *(parser->buffer_end++) = 0x80 + value & 0x3F; 219 313 } 220 314 /* 0000 0800-0000 FFFF -> 1110xxxx 10xxxxxx 10xxxxxx */ 221 else if ( ch<= 0xFFFF) {222 *(parser->buffer_end++) = 0x 80 + ch & 0x3F;223 *(parser->buffer_end++) = 0x C0 + (ch >> 6) & 0x1F;224 315 else if (value <= 0xFFFF) { 316 *(parser->buffer_end++) = 0xE0 + (value >> 12); 317 *(parser->buffer_end++) = 0x80 + (value >> 6) & 0x3F; 318 *(parser->buffer_end++) = 0x80 + value & 0x3F; 225 319 } 226 320 /* 0001 0000-0010 FFFF -> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ 227 321 else { 322 *(parser->buffer_end++) = 0xF0 + (value >> 18); 323 *(parser->buffer_end++) = 0x80 + (value >> 12) & 0x3F; 324 *(parser->buffer_end++) = 0x80 + (value >> 6) & 0x3F; 325 *(parser->buffer_end++) = 0x80 + value & 0x3F; 228 326 } 229 327 } 230 231 } 232 328 } 329 330 return 1; 233 331 } 234 332 … … 238 336 */ 239 337 338 #define BOM_UTF8 "\xef\xbb\xbf" 339 #define BOM_UTF16LE "\xff\xfe" 340 #define BOM_UTF16BE "\xfe\xff" 341 240 342 int 241 343 yaml_parser_determine_encoding(yaml_parser_t *parser) … … 243 345 /* Ensure that we had enough bytes in the raw buffer. */ 244 346 245 while (!parser->eof && parser->raw_unread < 2) {347 while (!parser->eof && parser->raw_unread < 3) { 246 348 if (!yaml_parser_update_raw_buffer(parser)) { 247 349 return 0; … … 251 353 /* Determine the encoding. 
*/ 252 354 253 if (parser->raw_unread >= 2 && IS_UTF16BE_BOM(parser->raw_pointer)) { 355 if (parser->raw_unread >= 2 356 && !memcmp(parser->raw_pointer, BOM_UTF16LE, 2)) { 357 parser->encoding = YAML_UTF16LE_ENCODING; 358 parser->raw_pointer += 2; 359 parser->raw_unread -= 2; 360 } 361 else if (parser->raw_unread >= 2 362 && !memcmp(parser->raw_pointer, BOM_UTF16BE, 2)) { 254 363 parser->encoding = YAML_UTF16BE_ENCODING; 255 } 256 else if (parser->raw_unread >= 2 && IS_UTF16LE_BOM(parser->raw_pointer)) { 257 parser->encoding = YAML_UTF16LE_ENCODING; 364 parser->raw_pointer += 2; 365 parser->raw_unread -= 2; 366 } 367 else if (parser->raw_unread >= 3 368 && !memcmp(parser->raw_pointer, BOM_UTF8, 3)) { 369 parser->encoding = YAML_UTF8_ENCODING; 370 parser->raw_pointer += 3; 371 parser->raw_unread -= 3; 258 372 } 259 373 else { 260 374 parser->encoding = YAML_UTF8_ENCODING; 261 375 } 376 377 return 1; 262 378 } 263 379 … … 292 408 YAML_RAW_BUFFER_SIZE - parser->raw_unread, 293 409 &size_read)) { 294 parser->error = YAML_READER_ERROR; 295 return 0; 410 return yaml_parser_set_reader_error(parser, "Input error"); 296 411 } 297 412 parser->raw_unread += size_read; libyaml/trunk/tests/Makefile.am
r172 r181 1 1 AM_CPPFLAGS = -I$(top_srcdir)/include 2 2 LDADD = $(top_builddir)/src/libyaml.la 3 TESTS = test-version 4 check_PROGRAMS = test-version 3 TESTS = test-version test-reader 4 check_PROGRAMS = test-version test-reader libyaml/trunk/tests/test-reader.c
r172 r181 5 5 #include <assert.h> 6 6 7 /* 8 * Test cases are stolen from 9 * http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt 10 */ 11 12 typedef struct { 13 char *title; 14 char *test; 15 int result; 16 } test_case; 17 18 test_case utf8_sequences[] = { 19 /* {"title", "test 1|test 2|...|test N!", (0 or 1)}, */ 20 21 {"a simple test", "'test' is '\xd0\xbf\xd1\x80\xd0\xbe\xd0\xb2\xd0\xb5\xd1\x80\xd0\xba\xd0\xb0' in Russian!", 1}, 22 {"an empty line", "!", 1}, 23 24 {"u-0 is a control character", "\x00!", 0}, 25 {"u-80 is a control character", "\xc2\x80!", 0}, 26 {"u-800 is valid", "\xe0\xa0\x80!", 1}, 27 {"u-10000 is valid", "\xf0\x90\x80\x80!", 1}, 28 {"5 bytes sequences are not allowed", "\xf8\x88\x80\x80\x80!", 0}, 29 {"6 bytes sequences are not allowed", "\xfc\x84\x80\x80\x80\x80!", 0}, 30 31 {"u-7f is a control character", "\x7f!", 0}, 32 {"u-7FF is valid", "\xdf\xbf!", 1}, 33 {"u-FFFF is a control character", "\xef\xbf\xbf!", 0}, 34 {"u-1FFFFF is too large", "\xf7\xbf\xbf\xbf!", 0}, 35 {"u-3FFFFFF is 5 bytes", "\xfb\xbf\xbf\xbf\xbf!", 0}, 36 {"u-7FFFFFFF is 6 bytes", "\xfd\xbf\xbf\xbf\xbf\xbf!", 0}, 37 38 {"u-D7FF", "\xed\x9f\xbf!", 1}, 39 {"u-E000", "\xee\x80\x80!", 1}, 40 {"u-FFFD", "\xef\xbf\xbd!", 1}, 41 {"u-10FFFF", "\xf4\x8f\xbf\xbf!", 1}, 42 {"u-110000", "\xf4\x90\x80\x80!", 0}, 43 44 {"first continuation byte", "\x80!", 0}, 45 {"last continuation byte", "\xbf!", 0}, 46 47 {"2 continuation bytes", "\x80\xbf!", 0}, 48 {"3 continuation bytes", "\x80\xbf\x80!", 0}, 49 {"4 continuation bytes", "\x80\xbf\x80\xbf!", 0}, 50 {"5 continuation bytes", "\x80\xbf\x80\xbf\x80!", 0}, 51 {"6 continuation bytes", "\x80\xbf\x80\xbf\x80\xbf!", 0}, 52 {"7 continuation bytes", "\x80\xbf\x80\xbf\x80\xbf\x80!", 0}, 53 54 {"sequence of all 64 possible continuation bytes", 55 "\x80|\x81|\x82|\x83|\x84|\x85|\x86|\x87|\x88|\x89|\x8a|\x8b|\x8c|\x8d|\x8e|\x8f|" 56 "\x90|\x91|\x92|\x93|\x94|\x95|\x96|\x97|\x98|\x99|\x9a|\x9b|\x9c|\x9d|\x9e|\x9f|" 57 
"\xa0|\xa1|\xa2|\xa3|\xa4|\xa5|\xa6|\xa7|\xa8|\xa9|\xaa|\xab|\xac|\xad|\xae|\xaf|" 58 "\xb0|\xb1|\xb2|\xb3|\xb4|\xb5|\xb6|\xb7|\xb8|\xb9|\xba|\xbb|\xbc|\xbd|\xbe|\xbf!", 0}, 59 {"32 first bytes of 2-byte sequences {0xc0-0xdf}", 60 "\xc0 |\xc1 |\xc2 |\xc3 |\xc4 |\xc5 |\xc6 |\xc7 |\xc8 |\xc9 |\xca |\xcb |\xcc |\xcd |\xce |\xcf |" 61 "\xd0 |\xd1 |\xd2 |\xd3 |\xd4 |\xd5 |\xd6 |\xd7 |\xd8 |\xd9 |\xda |\xdb |\xdc |\xdd |\xde |\xdf !", 0}, 62 {"16 first bytes of 3-byte sequences {0xe0-0xef}", 63 "\xe0 |\xe1 |\xe2 |\xe3 |\xe4 |\xe5 |\xe6 |\xe7 |\xe8 |\xe9 |\xea |\xeb |\xec |\xed |\xee |\xef !", 0}, 64 {"8 first bytes of 4-byte sequences {0xf0-0xf7}", "\xf0 |\xf1 |\xf2 |\xf3 |\xf4 |\xf5 |\xf6 |\xf7 !", 0}, 65 {"4 first bytes of 5-byte sequences {0xf8-0xfb}", "\xf8 |\xf9 |\xfa |\xfb !", 0}, 66 {"2 first bytes of 6-byte sequences {0xfc-0xfd}", "\xfc |\xfd !", 0}, 67 68 {"sequences with last byte missing {u-0}", 69 "\xc0|\xe0\x80|\xf0\x80\x80|\xf8\x80\x80\x80|\xfc\x80\x80\x80\x80!", 0}, 70 {"sequences with last byte missing {u-...FF}", 71 "\xdf|\xef\xbf|\xf7\xbf\xbf|\xfb\xbf\xbf\xbf|\xfd\xbf\xbf\xbf\xbf!", 0}, 72 73 {"impossible bytes", "\xfe|\xff|\xfe\xfe\xff\xff!", 0}, 74 75 {"overlong sequences {u-2f}", 76 "\xc0\xaf|\xe0\x80\xaf|\xf0\x80\x80\xaf|\xf8\x80\x80\x80\xaf|\xfc\x80\x80\x80\x80\xaf!", 0}, 77 78 {"maximum overlong sequences", 79 "\xc1\xbf|\xe0\x9f\xbf|\xf0\x8f\xbf\xbf|\xf8\x87\xbf\xbf\xbf|\xfc\x83\xbf\xbf\xbf\xbf!", 0}, 80 81 {"overlong representation of the NUL character", 82 "\xc0\x80|\xe0\x80\x80|\xf0\x80\x80\x80|\xf8\x80\x80\x80\x80|\xfc\x80\x80\x80\x80\x80!", 0}, 83 84 {"single UTF-16 surrogates", 85 "\xed\xa0\x80|\xed\xad\xbf|\xed\xae\x80|\xed\xaf\xbf|\xed\xb0\x80|\xed\xbe\x80|\xed\xbf\xbf!", 0}, 86 87 {"paired UTF-16 surrogates", 88 "\xed\xa0\x80\xed\xb0\x80|\xed\xa0\x80\xed\xbf\xbf|\xed\xad\xbf\xed\xb0\x80|" 89 "\xed\xad\xbf\xed\xbf\xbf|\xed\xae\x80\xed\xb0\x80|\xed\xae\x80\xed\xbf\xbf|" 90 "\xed\xaf\xbf\xed\xb0\x80|\xed\xaf\xbf\xed\xbf\xbf!", 0}, 91 92 
{"other illegal code positions", "\xef\xbf\xbe|\xef\xbf\xbf!", 0}, 93 94 {NULL, NULL, 0} 95 }; 96 97 int check_utf8_sequences(void) 98 { 99 yaml_parser_t *parser; 100 int failed = 0; 101 int k; 102 printf("checking utf-8 sequences...\n"); 103 for (k = 0; utf8_sequences[k].test; k++) { 104 char *title = utf8_sequences[k].title; 105 int check = utf8_sequences[k].result; 106 int result; 107 char *start = utf8_sequences[k].test; 108 char *end = start; 109 printf("\t%s:\n", title); 110 while(1) { 111 while (*end != '|' && *end != '!') end++; 112 parser = yaml_parser_new(); 113 assert(parser); 114 yaml_parser_set_input_string(parser, (unsigned char *)start, end-start); 115 result = yaml_parser_update_buffer(parser, end-start); 116 if (result != check) { 117 printf("\t\t- "); 118 failed ++; 119 } 120 else { 121 printf("\t\t+ "); 122 } 123 if (!parser->error) { 124 printf("(no error)\n"); 125 } 126 else if (parser->error == YAML_READER_ERROR) { 127 printf("(reader error: %s at %d)\n", parser->problem, parser->problem_offset); 128 } 129 if (*end == '!') break; 130 start = ++end; 131 yaml_parser_delete(parser); 132 }; 133 printf("\n"); 134 } 135 printf("checking utf-8 sequences: %d fail(s)\n", failed); 136 return failed; 137 } 138 139 7 140 int 8 141 main(void) 9 142 { 10 int major = -1; 11 int minor = -1; 12 int patch = -1; 13 char buf[64]; 14 15 yaml_get_version(&major, &minor, &patch); 16 sprintf(buf, "%d.%d.%d", major, minor, patch); 17 assert(strcmp(buf, yaml_get_version_string()) == 0); 18 19 return 0; 143 return check_utf8_sequences(); 20 144 }
