| 1 |
#include <yaml.h> |
|---|
| 2 |
|
|---|
| 3 |
YAML_DECLARE(int) |
|---|
| 4 |
yaml_parser_update_buffer(yaml_parser_t *parser, size_t length); |
|---|
| 5 |
|
|---|
| 6 |
#include <stdlib.h> |
|---|
| 7 |
#include <stdio.h> |
|---|
| 8 |
|
|---|
| 9 |
#ifdef NDEBUG |
|---|
| 10 |
#undef NDEBUG |
|---|
| 11 |
#endif |
|---|
| 12 |
#include <assert.h> |
|---|
| 13 |
|
|---|
| 14 |
/* |
|---|
| 15 |
* Test cases are stolen from |
|---|
| 16 |
* http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt |
|---|
| 17 |
*/ |
|---|
| 18 |
|
|---|
| 19 |
/*
 * One entry of a test table.
 *
 * The tables in this file are terminated by an entry whose `test` is NULL.
 */
typedef struct {
    char *title;    /* label printed before the outcome of the test */
    char *test;     /* raw input bytes; the end of the input is marked by '!' */
    int result;     /* expected value; its meaning depends on the table */
} test_case;
|---|
| 24 |
|
|---|
| 25 |
test_case utf8_sequences[] = { |
|---|
| 26 |
/* {"title", "test 1|test 2|...|test N!", (0 or 1)}, */ |
|---|
| 27 |
|
|---|
| 28 |
{"a simple test", "'test' is '\xd0\xbf\xd1\x80\xd0\xbe\xd0\xb2\xd0\xb5\xd1\x80\xd0\xba\xd0\xb0' in Russian!", 1}, |
|---|
| 29 |
{"an empty line", "!", 1}, |
|---|
| 30 |
|
|---|
| 31 |
{"u-0 is a control character", "\x00!", 0}, |
|---|
| 32 |
{"u-80 is a control character", "\xc2\x80!", 0}, |
|---|
| 33 |
{"u-800 is valid", "\xe0\xa0\x80!", 1}, |
|---|
| 34 |
{"u-10000 is valid", "\xf0\x90\x80\x80!", 1}, |
|---|
| 35 |
{"5 bytes sequences are not allowed", "\xf8\x88\x80\x80\x80!", 0}, |
|---|
| 36 |
{"6 bytes sequences are not allowed", "\xfc\x84\x80\x80\x80\x80!", 0}, |
|---|
| 37 |
|
|---|
| 38 |
{"u-7f is a control character", "\x7f!", 0}, |
|---|
| 39 |
{"u-7FF is valid", "\xdf\xbf!", 1}, |
|---|
| 40 |
{"u-FFFF is a control character", "\xef\xbf\xbf!", 0}, |
|---|
| 41 |
{"u-1FFFFF is too large", "\xf7\xbf\xbf\xbf!", 0}, |
|---|
| 42 |
{"u-3FFFFFF is 5 bytes", "\xfb\xbf\xbf\xbf\xbf!", 0}, |
|---|
| 43 |
{"u-7FFFFFFF is 6 bytes", "\xfd\xbf\xbf\xbf\xbf\xbf!", 0}, |
|---|
| 44 |
|
|---|
| 45 |
{"u-D7FF", "\xed\x9f\xbf!", 1}, |
|---|
| 46 |
{"u-E000", "\xee\x80\x80!", 1}, |
|---|
| 47 |
{"u-FFFD", "\xef\xbf\xbd!", 1}, |
|---|
| 48 |
{"u-10FFFF", "\xf4\x8f\xbf\xbf!", 1}, |
|---|
| 49 |
{"u-110000", "\xf4\x90\x80\x80!", 0}, |
|---|
| 50 |
|
|---|
| 51 |
{"first continuation byte", "\x80!", 0}, |
|---|
| 52 |
{"last continuation byte", "\xbf!", 0}, |
|---|
| 53 |
|
|---|
| 54 |
{"2 continuation bytes", "\x80\xbf!", 0}, |
|---|
| 55 |
{"3 continuation bytes", "\x80\xbf\x80!", 0}, |
|---|
| 56 |
{"4 continuation bytes", "\x80\xbf\x80\xbf!", 0}, |
|---|
| 57 |
{"5 continuation bytes", "\x80\xbf\x80\xbf\x80!", 0}, |
|---|
| 58 |
{"6 continuation bytes", "\x80\xbf\x80\xbf\x80\xbf!", 0}, |
|---|
| 59 |
{"7 continuation bytes", "\x80\xbf\x80\xbf\x80\xbf\x80!", 0}, |
|---|
| 60 |
|
|---|
| 61 |
{"sequence of all 64 possible continuation bytes", |
|---|
| 62 |
"\x80|\x81|\x82|\x83|\x84|\x85|\x86|\x87|\x88|\x89|\x8a|\x8b|\x8c|\x8d|\x8e|\x8f|" |
|---|
| 63 |
"\x90|\x91|\x92|\x93|\x94|\x95|\x96|\x97|\x98|\x99|\x9a|\x9b|\x9c|\x9d|\x9e|\x9f|" |
|---|
| 64 |
"\xa0|\xa1|\xa2|\xa3|\xa4|\xa5|\xa6|\xa7|\xa8|\xa9|\xaa|\xab|\xac|\xad|\xae|\xaf|" |
|---|
| 65 |
"\xb0|\xb1|\xb2|\xb3|\xb4|\xb5|\xb6|\xb7|\xb8|\xb9|\xba|\xbb|\xbc|\xbd|\xbe|\xbf!", 0}, |
|---|
| 66 |
{"32 first bytes of 2-byte sequences {0xc0-0xdf}", |
|---|
| 67 |
"\xc0 |\xc1 |\xc2 |\xc3 |\xc4 |\xc5 |\xc6 |\xc7 |\xc8 |\xc9 |\xca |\xcb |\xcc |\xcd |\xce |\xcf |" |
|---|
| 68 |
"\xd0 |\xd1 |\xd2 |\xd3 |\xd4 |\xd5 |\xd6 |\xd7 |\xd8 |\xd9 |\xda |\xdb |\xdc |\xdd |\xde |\xdf !", 0}, |
|---|
| 69 |
{"16 first bytes of 3-byte sequences {0xe0-0xef}", |
|---|
| 70 |
"\xe0 |\xe1 |\xe2 |\xe3 |\xe4 |\xe5 |\xe6 |\xe7 |\xe8 |\xe9 |\xea |\xeb |\xec |\xed |\xee |\xef !", 0}, |
|---|
| 71 |
{"8 first bytes of 4-byte sequences {0xf0-0xf7}", "\xf0 |\xf1 |\xf2 |\xf3 |\xf4 |\xf5 |\xf6 |\xf7 !", 0}, |
|---|
| 72 |
{"4 first bytes of 5-byte sequences {0xf8-0xfb}", "\xf8 |\xf9 |\xfa |\xfb !", 0}, |
|---|
| 73 |
{"2 first bytes of 6-byte sequences {0xfc-0xfd}", "\xfc |\xfd !", 0}, |
|---|
| 74 |
|
|---|
| 75 |
{"sequences with last byte missing {u-0}", |
|---|
| 76 |
"\xc0|\xe0\x80|\xf0\x80\x80|\xf8\x80\x80\x80|\xfc\x80\x80\x80\x80!", 0}, |
|---|
| 77 |
{"sequences with last byte missing {u-...FF}", |
|---|
| 78 |
"\xdf|\xef\xbf|\xf7\xbf\xbf|\xfb\xbf\xbf\xbf|\xfd\xbf\xbf\xbf\xbf!", 0}, |
|---|
| 79 |
|
|---|
| 80 |
{"impossible bytes", "\xfe|\xff|\xfe\xfe\xff\xff!", 0}, |
|---|
| 81 |
|
|---|
| 82 |
{"overlong sequences {u-2f}", |
|---|
| 83 |
"\xc0\xaf|\xe0\x80\xaf|\xf0\x80\x80\xaf|\xf8\x80\x80\x80\xaf|\xfc\x80\x80\x80\x80\xaf!", 0}, |
|---|
| 84 |
|
|---|
| 85 |
{"maximum overlong sequences", |
|---|
| 86 |
"\xc1\xbf|\xe0\x9f\xbf|\xf0\x8f\xbf\xbf|\xf8\x87\xbf\xbf\xbf|\xfc\x83\xbf\xbf\xbf\xbf!", 0}, |
|---|
| 87 |
|
|---|
| 88 |
{"overlong representation of the NUL character", |
|---|
| 89 |
"\xc0\x80|\xe0\x80\x80|\xf0\x80\x80\x80|\xf8\x80\x80\x80\x80|\xfc\x80\x80\x80\x80\x80!", 0}, |
|---|
| 90 |
|
|---|
| 91 |
{"single UTF-16 surrogates", |
|---|
| 92 |
"\xed\xa0\x80|\xed\xad\xbf|\xed\xae\x80|\xed\xaf\xbf|\xed\xb0\x80|\xed\xbe\x80|\xed\xbf\xbf!", 0}, |
|---|
| 93 |
|
|---|
| 94 |
{"paired UTF-16 surrogates", |
|---|
| 95 |
"\xed\xa0\x80\xed\xb0\x80|\xed\xa0\x80\xed\xbf\xbf|\xed\xad\xbf\xed\xb0\x80|" |
|---|
| 96 |
"\xed\xad\xbf\xed\xbf\xbf|\xed\xae\x80\xed\xb0\x80|\xed\xae\x80\xed\xbf\xbf|" |
|---|
| 97 |
"\xed\xaf\xbf\xed\xb0\x80|\xed\xaf\xbf\xed\xbf\xbf!", 0}, |
|---|
| 98 |
|
|---|
| 99 |
{"other illegal code positions", "\xef\xbf\xbe|\xef\xbf\xbf!", 0}, |
|---|
| 100 |
|
|---|
| 101 |
{NULL, NULL, 0} |
|---|
| 102 |
}; |
|---|
| 103 |
|
|---|
| 104 |
test_case boms[] = { |
|---|
| 105 |
|
|---|
| 106 |
/* {"title", "test!", lenth}, */ |
|---|
| 107 |
|
|---|
| 108 |
{"no bom (utf-8)", "Hi is \xd0\x9f\xd1\x80\xd0\xb8\xd0\xb2\xd0\xb5\xd1\x82!", 13}, |
|---|
| 109 |
{"bom (utf-8)", "\xef\xbb\xbfHi is \xd0\x9f\xd1\x80\xd0\xb8\xd0\xb2\xd0\xb5\xd1\x82!", 13}, |
|---|
| 110 |
{"bom (utf-16-le)", "\xff\xfeH\x00i\x00 \x00i\x00s\x00 \x00\x1f\x04@\x04""8\x04""2\x04""5\x04""B\x04!", 13}, |
|---|
| 111 |
{"bom (utf-16-be)", "\xfe\xff\x00H\x00i\x00 \x00i\x00s\x00 \x04\x1f\x04@\x04""8\x04""2\x04""5\x04""B!", 13}, |
|---|
| 112 |
{NULL, NULL, 0} |
|---|
| 113 |
}; |
|---|
| 114 |
|
|---|
| 115 |
/* Reference bytes that check_boms() compares the reader's buffer against. */
char *bom_original =
    "Hi is "
    "\xd0\x9f\xd1\x80\xd0\xb8\xd0\xb2\xd0\xb5\xd1\x82";
|---|
| 116 |
|
|---|
| 117 |
int check_utf8_sequences(void) |
|---|
| 118 |
{ |
|---|
| 119 |
yaml_parser_t parser; |
|---|
| 120 |
int failed = 0; |
|---|
| 121 |
int k; |
|---|
| 122 |
printf("checking utf-8 sequences...\n"); |
|---|
| 123 |
for (k = 0; utf8_sequences[k].test; k++) { |
|---|
| 124 |
char *title = utf8_sequences[k].title; |
|---|
| 125 |
int check = utf8_sequences[k].result; |
|---|
| 126 |
int result; |
|---|
| 127 |
char *start = utf8_sequences[k].test; |
|---|
| 128 |
char *end = start; |
|---|
| 129 |
printf("\t%s:\n", title); |
|---|
| 130 |
while(1) { |
|---|
| 131 |
while (*end != '|' && *end != '!') end++; |
|---|
| 132 |
yaml_parser_initialize(&parser); |
|---|
| 133 |
yaml_parser_set_input_string(&parser, (unsigned char *)start, end-start); |
|---|
| 134 |
result = yaml_parser_update_buffer(&parser, end-start); |
|---|
| 135 |
if (result != check) { |
|---|
| 136 |
printf("\t\t- "); |
|---|
| 137 |
failed ++; |
|---|
| 138 |
} |
|---|
| 139 |
else { |
|---|
| 140 |
printf("\t\t+ "); |
|---|
| 141 |
} |
|---|
| 142 |
if (!parser.error) { |
|---|
| 143 |
printf("(no error)\n"); |
|---|
| 144 |
} |
|---|
| 145 |
else if (parser.error == YAML_READER_ERROR) { |
|---|
| 146 |
if (parser.problem_value != -1) { |
|---|
| 147 |
printf("(reader error: %s: #%X at %d)\n", |
|---|
| 148 |
parser.problem, parser.problem_value, parser.problem_offset); |
|---|
| 149 |
} |
|---|
| 150 |
else { |
|---|
| 151 |
printf("(reader error: %s at %d)\n", |
|---|
| 152 |
parser.problem, parser.problem_offset); |
|---|
| 153 |
} |
|---|
| 154 |
} |
|---|
| 155 |
if (*end == '!') break; |
|---|
| 156 |
start = ++end; |
|---|
| 157 |
yaml_parser_delete(&parser); |
|---|
| 158 |
}; |
|---|
| 159 |
printf("\n"); |
|---|
| 160 |
} |
|---|
| 161 |
printf("checking utf-8 sequences: %d fail(s)\n", failed); |
|---|
| 162 |
return failed; |
|---|
| 163 |
} |
|---|
| 164 |
|
|---|
| 165 |
int check_boms(void) |
|---|
| 166 |
{ |
|---|
| 167 |
yaml_parser_t parser; |
|---|
| 168 |
int failed = 0; |
|---|
| 169 |
int k; |
|---|
| 170 |
printf("checking boms...\n"); |
|---|
| 171 |
for (k = 0; boms[k].test; k++) { |
|---|
| 172 |
char *title = boms[k].title; |
|---|
| 173 |
int check = boms[k].result; |
|---|
| 174 |
int result; |
|---|
| 175 |
char *start = boms[k].test; |
|---|
| 176 |
char *end = start; |
|---|
| 177 |
while (*end != '!') end++; |
|---|
| 178 |
printf("\t%s: ", title); |
|---|
| 179 |
yaml_parser_initialize(&parser); |
|---|
| 180 |
yaml_parser_set_input_string(&parser, (unsigned char *)start, end-start); |
|---|
| 181 |
result = yaml_parser_update_buffer(&parser, end-start); |
|---|
| 182 |
if (!result) { |
|---|
| 183 |
printf("- (reader error: %s at %d)\n", parser.problem, parser.problem_offset); |
|---|
| 184 |
failed++; |
|---|
| 185 |
} |
|---|
| 186 |
else { |
|---|
| 187 |
if (parser.unread != check) { |
|---|
| 188 |
printf("- (length=%d while expected length=%d)\n", parser.unread, check); |
|---|
| 189 |
failed++; |
|---|
| 190 |
} |
|---|
| 191 |
else if (memcmp(parser.buffer.start, bom_original, check) != 0) { |
|---|
| 192 |
printf("- (value '%s' does not equal to the original value '%s')\n", parser.buffer.start, bom_original); |
|---|
| 193 |
failed++; |
|---|
| 194 |
} |
|---|
| 195 |
else { |
|---|
| 196 |
printf("+\n"); |
|---|
| 197 |
} |
|---|
| 198 |
} |
|---|
| 199 |
yaml_parser_delete(&parser); |
|---|
| 200 |
} |
|---|
| 201 |
printf("checking boms: %d fail(s)\n", failed); |
|---|
| 202 |
return failed; |
|---|
| 203 |
} |
|---|
| 204 |
|
|---|
| 205 |
/* Number of two-byte characters written into the long-input test buffers. */
#define LONG 100000
|---|
| 206 |
|
|---|
| 207 |
int check_long_utf8(void) |
|---|
| 208 |
{ |
|---|
| 209 |
yaml_parser_t parser; |
|---|
| 210 |
int k = 0; |
|---|
| 211 |
int j; |
|---|
| 212 |
int failed = 0; |
|---|
| 213 |
unsigned char ch0, ch1; |
|---|
| 214 |
unsigned char *buffer = malloc(3+LONG*2); |
|---|
| 215 |
assert(buffer); |
|---|
| 216 |
printf("checking a long utf8 sequence...\n"); |
|---|
| 217 |
buffer[k++] = '\xef'; |
|---|
| 218 |
buffer[k++] = '\xbb'; |
|---|
| 219 |
buffer[k++] = '\xbf'; |
|---|
| 220 |
for (j = 0; j < LONG; j ++) { |
|---|
| 221 |
if (j % 2) { |
|---|
| 222 |
buffer[k++] = '\xd0'; |
|---|
| 223 |
buffer[k++] = '\x90'; |
|---|
| 224 |
} |
|---|
| 225 |
else { |
|---|
| 226 |
buffer[k++] = '\xd0'; |
|---|
| 227 |
buffer[k++] = '\xaf'; |
|---|
| 228 |
} |
|---|
| 229 |
} |
|---|
| 230 |
yaml_parser_initialize(&parser); |
|---|
| 231 |
yaml_parser_set_input_string(&parser, buffer, 3+LONG*2); |
|---|
| 232 |
for (k = 0; k < LONG; k++) { |
|---|
| 233 |
if (!parser.unread) { |
|---|
| 234 |
if (!yaml_parser_update_buffer(&parser, 1)) { |
|---|
| 235 |
printf("\treader error: %s at %d\n", parser.problem, parser.problem_offset); |
|---|
| 236 |
failed = 1; |
|---|
| 237 |
break; |
|---|
| 238 |
} |
|---|
| 239 |
} |
|---|
| 240 |
if (!parser.unread) { |
|---|
| 241 |
printf("\tnot enough characters at %d\n", k); |
|---|
| 242 |
failed = 1; |
|---|
| 243 |
break; |
|---|
| 244 |
} |
|---|
| 245 |
if (k % 2) { |
|---|
| 246 |
ch0 = '\xd0'; |
|---|
| 247 |
ch1 = '\x90'; |
|---|
| 248 |
} |
|---|
| 249 |
else { |
|---|
| 250 |
ch0 = '\xd0'; |
|---|
| 251 |
ch1 = '\xaf'; |
|---|
| 252 |
} |
|---|
| 253 |
if (parser.buffer.pointer[0] != ch0 || parser.buffer.pointer[1] != ch1) { |
|---|
| 254 |
printf("\tincorrect UTF-8 sequence: %X %X instead of %X %X\n", |
|---|
| 255 |
(int)parser.buffer.pointer[0], (int)parser.buffer.pointer[1], |
|---|
| 256 |
(int)ch0, (int)ch1); |
|---|
| 257 |
failed = 1; |
|---|
| 258 |
break; |
|---|
| 259 |
} |
|---|
| 260 |
parser.buffer.pointer += 2; |
|---|
| 261 |
parser.unread -= 1; |
|---|
| 262 |
} |
|---|
| 263 |
if (!failed) { |
|---|
| 264 |
if (!yaml_parser_update_buffer(&parser, 1)) { |
|---|
| 265 |
printf("\treader error: %s at %d\n", parser.problem, parser.problem_offset); |
|---|
| 266 |
failed = 1; |
|---|
| 267 |
} |
|---|
| 268 |
else if (parser.buffer.pointer[0] != '\0') { |
|---|
| 269 |
printf("\texpected NUL, found %X (eof=%d, unread=%d)\n", (int)parser.buffer.pointer[0], parser.eof, parser.unread); |
|---|
| 270 |
failed = 1; |
|---|
| 271 |
} |
|---|
| 272 |
} |
|---|
| 273 |
yaml_parser_delete(&parser); |
|---|
| 274 |
free(buffer); |
|---|
| 275 |
printf("checking a long utf8 sequence: %d fail(s)\n", failed); |
|---|
| 276 |
return failed; |
|---|
| 277 |
} |
|---|
| 278 |
|
|---|
| 279 |
int check_long_utf16(void) |
|---|
| 280 |
{ |
|---|
| 281 |
yaml_parser_t parser; |
|---|
| 282 |
int k = 0; |
|---|
| 283 |
int j; |
|---|
| 284 |
int failed = 0; |
|---|
| 285 |
unsigned char ch0, ch1; |
|---|
| 286 |
unsigned char *buffer = malloc(2+LONG*2); |
|---|
| 287 |
assert(buffer); |
|---|
| 288 |
printf("checking a long utf16 sequence...\n"); |
|---|
| 289 |
buffer[k++] = '\xff'; |
|---|
| 290 |
buffer[k++] = '\xfe'; |
|---|
| 291 |
for (j = 0; j < LONG; j ++) { |
|---|
| 292 |
if (j % 2) { |
|---|
| 293 |
buffer[k++] = '\x10'; |
|---|
| 294 |
buffer[k++] = '\x04'; |
|---|
| 295 |
} |
|---|
| 296 |
else { |
|---|
| 297 |
buffer[k++] = '/'; |
|---|
| 298 |
buffer[k++] = '\x04'; |
|---|
| 299 |
} |
|---|
| 300 |
} |
|---|
| 301 |
yaml_parser_initialize(&parser); |
|---|
| 302 |
yaml_parser_set_input_string(&parser, buffer, 2+LONG*2); |
|---|
| 303 |
for (k = 0; k < LONG; k++) { |
|---|
| 304 |
if (!parser.unread) { |
|---|
| 305 |
if (!yaml_parser_update_buffer(&parser, 1)) { |
|---|
| 306 |
printf("\treader error: %s at %d\n", parser.problem, parser.problem_offset); |
|---|
| 307 |
failed = 1; |
|---|
| 308 |
break; |
|---|
| 309 |
} |
|---|
| 310 |
} |
|---|
| 311 |
if (!parser.unread) { |
|---|
| 312 |
printf("\tnot enough characters at %d\n", k); |
|---|
| 313 |
failed = 1; |
|---|
| 314 |
break; |
|---|
| 315 |
} |
|---|
| 316 |
if (k % 2) { |
|---|
| 317 |
ch0 = '\xd0'; |
|---|
| 318 |
ch1 = '\x90'; |
|---|
| 319 |
} |
|---|
| 320 |
else { |
|---|
| 321 |
ch0 = '\xd0'; |
|---|
| 322 |
ch1 = '\xaf'; |
|---|
| 323 |
} |
|---|
| 324 |
if (parser.buffer.pointer[0] != ch0 || parser.buffer.pointer[1] != ch1) { |
|---|
| 325 |
printf("\tincorrect UTF-8 sequence: %X %X instead of %X %X\n", |
|---|
| 326 |
(int)parser.buffer.pointer[0], (int)parser.buffer.pointer[1], |
|---|
| 327 |
(int)ch0, (int)ch1); |
|---|
| 328 |
failed = 1; |
|---|
| 329 |
break; |
|---|
| 330 |
} |
|---|
| 331 |
parser.buffer.pointer += 2; |
|---|
| 332 |
parser.unread -= 1; |
|---|
| 333 |
} |
|---|
| 334 |
if (!failed) { |
|---|
| 335 |
if (!yaml_parser_update_buffer(&parser, 1)) { |
|---|
| 336 |
printf("\treader error: %s at %d\n", parser.problem, parser.problem_offset); |
|---|
| 337 |
failed = 1; |
|---|
| 338 |
} |
|---|
| 339 |
else if (parser.buffer.pointer[0] != '\0') { |
|---|
| 340 |
printf("\texpected NUL, found %X (eof=%d, unread=%d)\n", (int)parser.buffer.pointer[0], parser.eof, parser.unread); |
|---|
| 341 |
failed = 1; |
|---|
| 342 |
} |
|---|
| 343 |
} |
|---|
| 344 |
yaml_parser_delete(&parser); |
|---|
| 345 |
free(buffer); |
|---|
| 346 |
printf("checking a long utf16 sequence: %d fail(s)\n", failed); |
|---|
| 347 |
return failed; |
|---|
| 348 |
} |
|---|
| 349 |
|
|---|
| 350 |
/*
 * Run all reader checks and return the total number of failures as the
 * process exit status (0 means every check passed).
 *
 * The checks are called in separate statements because the operands of `+`
 * may be evaluated in any order; sequencing them keeps the printed report in
 * a fixed order.
 */
int
main(void)
{
    int failures = 0;

    failures += check_utf8_sequences();
    failures += check_boms();
    failures += check_long_utf8();
    failures += check_long_utf16();

    return failures;
}
|---|