Changeset 181 for libyaml/trunk/src/reader.c
- Timestamp: 05/30/06 13:49:18 (7 years ago)
- File: 1 edited
  - libyaml/trunk/src/reader.c (modified) (9 diffs)
Legend:
- Unmodified
- Added
- Removed
libyaml/trunk/src/reader.c
r180 r181 8 8 #include <assert.h> 9 9 10 /* Check for the UTF-16-BE BOM. */11 #define IS_UTF16BE_BOM(pointer) ((pointer)[0] == 0xFE && (pointer)[1] == 0xFF)12 13 /* Check for the UTF-16-LE BOM. */14 #define IS_UTF16LE_BOM(pointer) ((pointer)[0] == 0xFF && (pointer)[1] == 0xFE)15 16 /* Get a UTF-16-BE character. */17 #define UTF16BE_CHAR(pointer) ((pointer)[0] << 8 + (pointer)[1])18 19 /* Get a UTF-16-LE character. */20 #define UTF16LE_CHAR(pointer) ((pointer)[0] + (pointer)[1] << 8)21 22 10 /* 23 * From http://www.ietf.org/rfc/rfc3629.txt: 24 * 25 * Char. number range | UTF-8 octet sequence 26 * (hexadecimal) | (binary) 27 * --------------------+--------------------------------------------- 28 * 0000 0000-0000 007F | 0xxxxxxx 29 * 0000 0080-0000 07FF | 110xxxxx 10xxxxxx 30 * 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx 31 * 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 11 * Set the reader error and return 0. 32 12 */ 33 13 34 /* Get the length of a UTF-8 character (0 on error). */ 35 #define UTF8_LENGTH(pointer) \ 36 ((pointer)[0] < 0x80 ? 1 : \ 37 (pointer)[0] < 0xC0 ? 0 : \ 38 (pointer)[0] < 0xE0 ? 2 : \ 39 (pointer)[0] < 0xF0 ? 3 : \ 40 (pointer)[0] < 0xF8 ? 4 : 0) 41 42 /* Get the value of the first byte of a UTF-8 sequence (0xFF on error). */ 43 #define UTF8_FIRST_CHUNK(pointer) \ 44 ((pointer)[0] < 0x80 ? (pointer)[0] & 0x7F : \ 45 (pointer)[0] < 0xC0 ? 0xFF : \ 46 (pointer)[0] < 0xE0 ? (pointer)[0] & 0x1F : \ 47 (pointer)[0] < 0xF0 ? (pointer)[0] & 0x0F : \ 48 (pointer)[0] < 0xF8 ? (pointer)[0] & 0x07 : 0xFF) 49 50 /* Get the value of a non-first byte of a UTF-8 sequence (0xFF on error). */ 51 #define UTF8_NEXT_CHUNK(pointer) \ 52 ((pointer)[0] >= 0x80 && (pointer)[0] < 0xC0 ? (pointer)[0] & 0x3F : 0xFF) 53 54 /* Determine the length of a UTF-8 character. 
*/ 14 int 15 yaml_parser_set_reader_error(yaml_parser_t *parser, const char *problem) 16 { 17 parser->error = YAML_READER_ERROR; 18 parser->problem = problem; 19 parser->problem_offset = parser->offset; 20 21 return 0; 22 } 23 55 24 56 25 /* … … 103 72 if (!yaml_parser_update_raw_buffer(parser)) return 0; 104 73 74 /* If the raw buffer is empty, it is EOF. */ 75 76 if (!parser->raw_unread) return 1; 77 105 78 /* Decode the raw buffer. */ 106 79 107 80 while (parser->raw_unread) 108 81 { 109 unsigned int ch;82 unsigned int value, value2; 110 83 int incomplete = 0; 84 unsigned char utf8_octet; 85 unsigned int utf8_length; 86 int k, low, high; 111 87 112 88 /* Decode the next character. */ … … 116 92 case YAML_UTF8_ENCODING: 117 93 118 unsigned int utf8_length = UTF8_LENGTH(parser->raw_pointer); 119 unsigned int utf8_chunk; 94 /* 95 * Decode a UTF-8 character. Check RFC 3629 96 * (http://www.ietf.org/rfc/rfc3629.txt) for more details. 97 * 98 * The following table (taken from the RFC) is used for 99 * decoding. 100 * 101 * Char. number range | UTF-8 octet sequence 102 * (hexadecimal) | (binary) 103 * --------------------+------------------------------------ 104 * 0000 0000-0000 007F | 0xxxxxxx 105 * 0000 0080-0000 07FF | 110xxxxx 10xxxxxx 106 * 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx 107 * 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 108 * 109 * Additionally, the characters in the range 0xD800-0xDFFF 110 * are prohibited as they are reserved for use with UTF-16 111 * surrogate pairs. 112 */ 113 114 /* Determine the length of the UTF-8 sequence. */ 115 116 utf8_octet = parser->raw_pointer[0]; 117 utf8_length = ( 118 (utf8_octet & 0x80) == 0x00 ? 1 : 119 (utf8_octet & 0xE0) == 0xC0 ? 2 : 120 (utf8_octet & 0xF0) == 0xE0 ? 3 : 121 (utf8_octet & 0xF8) == 0xF0 ? 4 : 0); 122 123 /* Check if the leading octet is valid. 
*/ 124 125 if (!utf8_length) 126 return yaml_parser_set_reader_error(parser, 127 "Invalid leading UTF-8 octet"); 120 128 121 129 /* Check if the raw buffer contains an incomplete character. */ … … 123 131 if (utf8_length > parser->raw_unread) { 124 132 if (parser->eof) { 125 parser->error = YAML_READER_ERROR;126 return 0;133 return yaml_parser_set_reader_error(parser, 134 "Incomplete UTF-8 octet sequence"); 127 135 } 128 136 incomplete = 1; 137 break; 129 138 } 130 139 131 /* Get the character checking it for validity. */ 132 133 utf8_chunk = UTF8_FIRST_CHUNK(parser->raw_pointer ++); 134 if (utf8_chunk == 0xFF) { 135 parser->error = YAML_READER_ERROR; 136 return 0; 140 /* Decode the leading octet. */ 141 142 value = ( 143 (utf8_octet & 0x80) == 0x00 ? utf8_octet & 0x7F : 144 (utf8_octet & 0xE0) == 0xC0 ? utf8_octet & 0x1F : 145 (utf8_octet & 0xF0) == 0xE0 ? utf8_octet & 0x0F : 146 (utf8_octet & 0xF8) == 0xF0 ? utf8_octet & 0x07 : 0); 147 148 /* Check and decode the trailing octets. */ 149 150 for (k = 1; k < utf8_length; k ++) 151 { 152 utf8_octet = parser->raw_pointer[k]; 153 154 /* Check if the octet is valid. */ 155 156 if ((utf8_octet & 0xC0) != 0x80) 157 return yaml_parser_set_reader_error(parser, 158 "Invalid trailing UTF-8 octet"); 159 160 /* Decode the octet. */ 161 162 value = (value << 6) + (utf8_octet & 0x3F); 137 163 } 138 ch = utf8_chunk; 139 parser->raw_unread --; 140 while (-- utf8_length) { 141 utf8_chunk = UTF8_NEXT_CHUNK(parser->raw_pointer ++); 142 if (utf8_chunk == 0xFF) { 143 parser->error = YAML_READER_ERROR; 144 return 0; 145 } 146 ch = ch << 6 + utf8_chunk; 147 parser->raw_unread --; 148 } 164 165 /* Check the length of the sequence against the value. 
*/ 166 167 if (!((utf8_length == 1) || 168 (utf8_length == 2 && value >= 0x80) || 169 (utf8_length == 3 && value >= 0x800) || 170 (utf8_length == 4 && value >= 0x10000))) 171 return yaml_parser_set_reader_error(parser, 172 "Invalid length of a UTF-8 sequence"); 173 174 /* Check the range of the value. */ 175 176 if ((value >= 0xD800 && value <= 0xDFFF) || value > 0x10FFFF) 177 return yaml_parser_set_reader_error(parser, 178 "Invalid Unicode character"); 179 180 parser->raw_pointer += utf8_length; 181 parser->raw_unread -= utf8_length; 182 parser->offset += utf8_length; 149 183 150 184 break; 151 185 152 186 case YAML_UTF16LE_ENCODING: 153 154 /* Check if the raw buffer contains an incomplete character. */ 187 case YAML_UTF16BE_ENCODING: 188 189 low = (parser->encoding == YAML_UTF16LE_ENCODING ? 0 : 1); 190 high = (parser->encoding == YAML_UTF16LE_ENCODING ? 1 : 0); 191 192 /* 193 * The UTF-16 encoding is not as simple as one might 194 * naively think. Check RFC 2781 195 * (http://www.ietf.org/rfc/rfc2781.txt). 196 * 197 * Normally, two subsequent bytes describe a Unicode 198 * character. However a special technique (called a 199 * surrogate pair) is used for specifying character 200 * values larger than 0xFFFF. 201 * 202 * A surrogate pair consists of two pseudo-characters: 203 * high surrogate area (0xD800-0xDBFF) 204 * low surrogate area (0xDC00-0xDFFF) 205 * 206 * The following formulas are used for decoding 207 * and encoding characters using surrogate pairs: 208 * 209 * U = U' + 0x10000 (0x01 00 00 <= U <= 0x10 FF FF) 210 * U' = yyyyyyyyyyxxxxxxxxxx (0 <= U' <= 0x0F FF FF) 211 * W1 = 110110yyyyyyyyyy 212 * W2 = 110111xxxxxxxxxx 213 * 214 * where U is the character value, W1 is the high surrogate 215 * area, W2 is the low surrogate area. 216 */ 217 218 /* Check for incomplete UTF-16 character. 
*/ 155 219 156 220 if (parser->raw_unread < 2) { 157 221 if (parser->eof) { 158 parser->error = YAML_READER_ERROR;159 return 0;222 return yaml_parser_set_reader_error(parser, 223 "Incomplete UTF-16 character"); 160 224 } 161 225 incomplete = 1; 226 break; 162 227 } 163 228 164 /* Get the current character. */ 165 166 ch = UTF16LE_CHAR(parser->raw_pointer); 167 parser->raw_pointer += 2; 168 parser->raw_unread -= 2; 169 170 break; 171 172 case YAML_UTF16BE_ENCODING: 173 174 /* Check if the raw buffer contains an incomplete character. */ 175 176 if (parser->raw_unread < 2) { 177 if (parser->eof) { 178 parser->error = YAML_READER_ERROR; 179 return 0; 229 /* Get the character. */ 230 231 value = parser->raw_pointer[low] 232 + (parser->raw_pointer[high] << 8); 233 234 /* Check for unexpected low surrogate area. */ 235 236 if ((value & 0xFC00) == 0xDC00) 237 return yaml_parser_set_reader_error(parser, 238 "Unexpected low surrogate area"); 239 240 /* Check for a high surrogate area. */ 241 242 if ((value & 0xFC00) == 0xD800) { 243 244 /* Check for incomplete surrogate pair. */ 245 246 if (parser->raw_unread < 4) { 247 if (parser->eof) { 248 return yaml_parser_set_reader_error(parser, 249 "Incomplete UTF-16 surrogate pair"); 250 } 251 incomplete = 1; 252 break; 180 253 } 181 incomplete = 1; 254 255 /* Get the next character. */ 256 257 unsigned int value2 = parser->raw_pointer[low+2] 258 + (parser->raw_pointer[high+2] << 8); 259 260 /* Check for a low surrogate area. */ 261 262 if ((value2 & 0xFC00) != 0xDC00) 263 return yaml_parser_set_reader_error(parser, 264 "Expected low surrogate area"); 265 266 /* Generate the value of the surrogate pair. */ 267 268 value = 0x10000 + ((value & 0x3FF) << 10) + (value2 & 0x3FF); 269 270 parser->raw_pointer += 4; 271 parser->raw_unread -= 4; 272 parser->offset += 4; 182 273 } 183 274 184 /* Get the current character. 
*/185 186 ch = UTF16BE_CHAR(parser->raw_pointer);187 parser->raw_pointer += 2;188 parser->raw_unread -= 2;275 else { 276 parser->raw_pointer += 2; 277 parser->raw_unread -= 2; 278 parser->offset += 4; 279 } 189 280 190 281 break; 191 282 } 283 284 /* Check if the raw buffer contains enough bytes to form a character. */ 285 286 if (incomplete) break; 192 287 193 288 /* … … 198 293 */ 199 294 200 if (! (ch == 0x09 || ch == 0x0A || ch == 0x0D 201 || (ch >= 0x20 && ch <= 0x7E) 202 || (ch == 0x85) || (ch >= 0xA0 && ch <= 0xD7FF) 203 || (ch >= 0xE000 && ch <= 0xFFFD) 204 || (ch >= 0x10000 && ch <= 0x10FFFF))) { 205 parser->error = YAML_READER_ERROR; 206 return 0; 207 } 295 if (! (value == 0x09 || value == 0x0A || value == 0x0D 296 || (value >= 0x20 && value <= 0x7E) 297 || (value == 0x85) || (value >= 0xA0 && value <= 0xD7FF) 298 || (value >= 0xE000 && value <= 0xFFFD) 299 || (value >= 0x10000 && value <= 0x10FFFF))) 300 return yaml_parser_set_reader_error(parser, 301 "Control characters are not allowed"); 208 302 209 303 /* Finally put the character into the buffer. 
*/ 210 304 211 305 /* 0000 0000-0000 007F -> 0xxxxxxx */ 212 if ( ch<= 0x7F) {213 *(parser->buffer_end++) = ch;306 if (value <= 0x7F) { 307 *(parser->buffer_end++) = value; 214 308 } 215 309 /* 0000 0080-0000 07FF -> 110xxxxx 10xxxxxx */ 216 else if ( ch<= 0x7FF) {217 *(parser->buffer_end++) = 0xC0 + ( ch >> 6) & 0x1F;218 *(parser->buffer_end++) = 0x80 + ch& 0x3F;310 else if (value <= 0x7FF) { 311 *(parser->buffer_end++) = 0xC0 + (value >> 6); 312 *(parser->buffer_end++) = 0x80 + value & 0x3F; 219 313 } 220 314 /* 0000 0800-0000 FFFF -> 1110xxxx 10xxxxxx 10xxxxxx */ 221 else if ( ch<= 0xFFFF) {222 *(parser->buffer_end++) = 0x 80 + ch & 0x3F;223 *(parser->buffer_end++) = 0x C0 + (ch >> 6) & 0x1F;224 315 else if (value <= 0xFFFF) { 316 *(parser->buffer_end++) = 0xE0 + (value >> 12); 317 *(parser->buffer_end++) = 0x80 + (value >> 6) & 0x3F; 318 *(parser->buffer_end++) = 0x80 + value & 0x3F; 225 319 } 226 320 /* 0001 0000-0010 FFFF -> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ 227 321 else { 322 *(parser->buffer_end++) = 0xF0 + (value >> 18); 323 *(parser->buffer_end++) = 0x80 + (value >> 12) & 0x3F; 324 *(parser->buffer_end++) = 0x80 + (value >> 6) & 0x3F; 325 *(parser->buffer_end++) = 0x80 + value & 0x3F; 228 326 } 229 327 } 230 231 } 232 328 } 329 330 return 1; 233 331 } 234 332 … … 238 336 */ 239 337 338 #define BOM_UTF8 "\xef\xbb\xbf" 339 #define BOM_UTF16LE "\xff\xfe" 340 #define BOM_UTF16BE "\xfe\xff" 341 240 342 int 241 343 yaml_parser_determine_encoding(yaml_parser_t *parser) … … 243 345 /* Ensure that we had enough bytes in the raw buffer. */ 244 346 245 while (!parser->eof && parser->raw_unread < 2) {347 while (!parser->eof && parser->raw_unread < 3) { 246 348 if (!yaml_parser_update_raw_buffer(parser)) { 247 349 return 0; … … 251 353 /* Determine the encoding. 
*/ 252 354 253 if (parser->raw_unread >= 2 && IS_UTF16BE_BOM(parser->raw_pointer)) { 355 if (parser->raw_unread >= 2 356 && !memcmp(parser->raw_pointer, BOM_UTF16LE, 2)) { 357 parser->encoding = YAML_UTF16LE_ENCODING; 358 parser->raw_pointer += 2; 359 parser->raw_unread -= 2; 360 } 361 else if (parser->raw_unread >= 2 362 && !memcmp(parser->raw_pointer, BOM_UTF16BE, 2)) { 254 363 parser->encoding = YAML_UTF16BE_ENCODING; 255 } 256 else if (parser->raw_unread >= 2 && IS_UTF16LE_BOM(parser->raw_pointer)) { 257 parser->encoding = YAML_UTF16LE_ENCODING; 364 parser->raw_pointer += 2; 365 parser->raw_unread -= 2; 366 } 367 else if (parser->raw_unread >= 3 368 && !memcmp(parser->raw_pointer, BOM_UTF8, 3)) { 369 parser->encoding = YAML_UTF8_ENCODING; 370 parser->raw_pointer += 3; 371 parser->raw_unread -= 3; 258 372 } 259 373 else { 260 374 parser->encoding = YAML_UTF8_ENCODING; 261 375 } 376 377 return 1; 262 378 } 263 379 … … 292 408 YAML_RAW_BUFFER_SIZE - parser->raw_unread, 293 409 &size_read)) { 294 parser->error = YAML_READER_ERROR; 295 return 0; 410 return yaml_parser_set_reader_error(parser, "Input error"); 296 411 } 297 412 parser->raw_unread += size_read;
Note: See TracChangeset for help on using the changeset viewer.
