/* Size in bytes of the buffer that holds raw (not yet decoded) input. */
#define RAW_BUFFER_SIZE 16384

/*
 * Size in bytes of the decoded buffer: twice the raw buffer, which should be
 * enough for decoding the whole raw buffer.
 */
#define BUFFER_SIZE (2 * RAW_BUFFER_SIZE)
|---|
| | 2 | #if HAVE_CONFIG_H |
|---|
| | 3 | #include <config.h> |
|---|
| | 4 | #endif |
|---|
| | 5 | |
|---|
| | 6 | #include <yaml/yaml.h> |
|---|
| | 7 | |
|---|
| | 8 | #include <assert.h> |
|---|
| | 9 | |
|---|
/*
 * Check for the UTF-16-BE BOM (bytes FE FF) at pointer[0..1].
 *
 * The bytes are converted to unsigned char before the comparison so that the
 * check also works when the buffer is declared as plain (possibly signed)
 * char.  The argument is evaluated twice; do not pass an expression with
 * side effects.
 */
#define IS_UTF16BE_BOM(pointer)                                             \
    ((unsigned char)(pointer)[0] == 0xFE && (unsigned char)(pointer)[1] == 0xFF)
|---|
| | 12 | |
|---|
/*
 * Check for the UTF-16-LE BOM (bytes FF FE) at pointer[0..1].
 *
 * The bytes are converted to unsigned char before the comparison so that the
 * check also works when the buffer is declared as plain (possibly signed)
 * char.  The argument is evaluated twice; do not pass an expression with
 * side effects.
 */
#define IS_UTF16LE_BOM(pointer)                                             \
    ((unsigned char)(pointer)[0] == 0xFF && (unsigned char)(pointer)[1] == 0xFE)
|---|
| | 15 | |
|---|
/*
 * Get a UTF-16-BE code unit: pointer[0] is the high byte, pointer[1] the low
 * byte.
 *
 * The shift is parenthesized explicitly: `+` binds tighter than `<<`, so
 * `a << 8 + b` would shift by (8 + b) instead of combining the two bytes.
 * The argument is evaluated twice; do not pass an expression with side
 * effects.
 */
#define UTF16BE_CHAR(pointer)                                               \
    (((unsigned char)(pointer)[0] << 8) + (unsigned char)(pointer)[1])
|---|
| | 18 | |
|---|
/*
 * Get a UTF-16-LE code unit: pointer[0] is the low byte, pointer[1] the high
 * byte.
 *
 * The shift is parenthesized explicitly: `+` binds tighter than `<<`, so
 * `a + b << 8` would compute (a + b) << 8 instead of combining the two bytes.
 * The argument is evaluated twice; do not pass an expression with side
 * effects.
 */
#define UTF16LE_CHAR(pointer)                                               \
    ((unsigned char)(pointer)[0] + ((unsigned char)(pointer)[1] << 8))
|---|
| | 21 | |
|---|
| | 22 | /* |
|---|
| | 23 | * From http://www.ietf.org/rfc/rfc3629.txt: |
|---|
| | 24 | * |
|---|
| | 25 | * Char. number range | UTF-8 octet sequence |
|---|
| | 26 | * (hexadecimal) | (binary) |
|---|
| | 27 | * --------------------+--------------------------------------------- |
|---|
| | 28 | * 0000 0000-0000 007F | 0xxxxxxx |
|---|
| | 29 | * 0000 0080-0000 07FF | 110xxxxx 10xxxxxx |
|---|
| | 30 | * 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx |
|---|
| | 31 | * 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
|---|
| | 32 | */ |
|---|
| | 33 | |
|---|
/*
 * Get the length in bytes of the UTF-8 sequence introduced by the lead byte
 * pointer[0]:
 *
 *      0xxxxxxx -> 1,  110xxxxx -> 2,  1110xxxx -> 3,  11110xxx -> 4.
 *
 * Returns 0 on error, i.e. when the byte is a continuation byte (10xxxxxx)
 * or has the form 11111xxx.
 *
 * Implemented as a function so that the argument is evaluated exactly once
 * and the byte is always inspected as an unsigned value.
 */
static int
yaml_utf8_length(const unsigned char *pointer)
{
    unsigned char lead = pointer[0];

    if (lead < 0x80) return 1;
    if (lead < 0xC0) return 0;
    if (lead < 0xE0) return 2;
    if (lead < 0xF0) return 3;
    if (lead < 0xF8) return 4;
    return 0;
}

#define UTF8_LENGTH(pointer)                                                \
    yaml_utf8_length((const unsigned char *)(pointer))
|---|
| | 41 | |
|---|
/*
 * Get the payload bits carried by the first byte of a UTF-8 sequence:
 *
 *      0xxxxxxx -> low 7 bits,  110xxxxx -> low 5 bits,
 *      1110xxxx -> low 4 bits,  11110xxx -> low 3 bits.
 *
 * Returns 0xFF on error, i.e. when the byte is a continuation byte
 * (10xxxxxx) or has the form 11111xxx.
 *
 * Implemented as a function so that the argument is evaluated exactly once:
 * the macro is invoked with `pointer ++`, and a macro that mentions its
 * argument several times would advance the pointer more than once and read
 * the wrong byte.
 */
static int
yaml_utf8_first_chunk(const unsigned char *pointer)
{
    unsigned char lead = pointer[0];

    if (lead < 0x80) return lead & 0x7F;
    if (lead < 0xC0) return 0xFF;
    if (lead < 0xE0) return lead & 0x1F;
    if (lead < 0xF0) return lead & 0x0F;
    if (lead < 0xF8) return lead & 0x07;
    return 0xFF;
}

#define UTF8_FIRST_CHUNK(pointer)                                           \
    yaml_utf8_first_chunk((const unsigned char *)(pointer))
|---|
| | 49 | |
|---|
/*
 * Get the six payload bits of a non-first (continuation) byte of a UTF-8
 * sequence.  Returns 0xFF on error, i.e. when the byte does not have the
 * form 10xxxxxx.
 *
 * Implemented as a function so that the argument is evaluated exactly once:
 * the macro is invoked with `pointer ++`, and a macro that mentions its
 * argument several times would advance the pointer more than once.
 */
static int
yaml_utf8_next_chunk(const unsigned char *pointer)
{
    unsigned char octet = pointer[0];

    if (octet >= 0x80 && octet < 0xC0) {
        return octet & 0x3F;
    }
    return 0xFF;
}

#define UTF8_NEXT_CHUNK(pointer)                                            \
    yaml_utf8_next_chunk((const unsigned char *)(pointer))
|---|
| | 53 | |
|---|
| | 54 | /* Determine the length of a UTF-8 character. */ |
|---|
| 19 | | /* First, let us check that the buffers are allocated. */ |
|---|
| 20 | | |
|---|
| 21 | | if (!parser->buffer) { |
|---|
| 22 | | parser->buffer = yaml_malloc(BUFFER_SIZE); |
|---|
| 23 | | if (!parser->buffer) { |
|---|
| 24 | | parser->error = YAML_MEMORY_ERROR; |
|---|
| 25 | | return 0; |
|---|
| 26 | | } |
|---|
| 27 | | parser->buffer_size = BUFFER_SIZE; |
|---|
| 28 | | parser->buffer_pointer = parser->buffer; |
|---|
| 29 | | parser->buffer_length = 0; |
|---|
| 30 | | } |
|---|
| 31 | | |
|---|
| 32 | | if (!parser->raw_buffer) { |
|---|
| 33 | | parser->raw_buffer = yaml_malloc(RAW_BUFFER_SIZE); |
|---|
| 34 | | if (!parser->raw_buffer) { |
|---|
| 35 | | parser->error = YAML_MEMORY_ERROR; |
|---|
| 36 | | return 0; |
|---|
| 37 | | } |
|---|
| 38 | | parser->raw_buffer_size = RAW_BUFFER_SIZE; |
|---|
| 39 | | } |
|---|
| 40 | | |
|---|
| 41 | | /* Next, determine the input encoding. */ |
|---|
| | 71 | /* Return if the buffer contains enough characters. */ |
|---|
| | 72 | |
|---|
| | 73 | if (parser->unread >= length) |
|---|
| | 74 | return 1; |
|---|
| | 75 | |
|---|
| | 76 | /* Determine the input encoding if it is not known yet. */ |
|---|
| 48 | | /* more... */ |
|---|
| | 83 | /* Move the unread characters to the beginning of the buffer. */ |
|---|
| | 84 | |
|---|
| | 85 | if (parser->buffer < parser->pointer |
|---|
| | 86 | && parser->pointer < parser->buffer_end) { |
|---|
| | 87 | size_t size = parser->buffer_end - parser->pointer; |
|---|
| | 88 | memmove(parser->buffer, parser->pointer, size); |
|---|
| | 89 | parser->pointer = parser->buffer; |
|---|
| | 90 | parser->buffer_end -= size; |
|---|
| | 91 | } |
|---|
| | 92 | else if (parser->pointer == parser->buffer_end) { |
|---|
| | 93 | parser->pointer = parser->buffer; |
|---|
| | 94 | parser->buffer_end = parser->buffer; |
|---|
| | 95 | } |
|---|
| | 96 | |
|---|
| | 97 | /* Fill the buffer until it has enough characters. */ |
|---|
| | 98 | |
|---|
| | 99 | while (parser->unread < length) |
|---|
| | 100 | { |
|---|
| | 101 | /* Fill the raw buffer. */ |
|---|
| | 102 | |
|---|
| | 103 | if (!yaml_parser_update_raw_buffer(parser)) return 0; |
|---|
| | 104 | |
|---|
| | 105 | /* Decode the raw buffer. */ |
|---|
| | 106 | |
|---|
| | 107 | while (parser->raw_unread) |
|---|
| | 108 | { |
|---|
| | 109 | unsigned int ch; |
|---|
| | 110 | int incomplete = 0; |
|---|
| | 111 | |
|---|
| | 112 | /* Decode the next character. */ |
|---|
| | 113 | |
|---|
| | 114 | switch (parser->encoding) |
|---|
| | 115 | { |
|---|
| | 116 | case YAML_UTF8_ENCODING: |
|---|
| | 117 | |
|---|
| | 118 | unsigned int utf8_length = UTF8_LENGTH(parser->raw_pointer); |
|---|
| | 119 | unsigned int utf8_chunk; |
|---|
| | 120 | |
|---|
| | 121 | /* Check if the raw buffer contains an incomplete character. */ |
|---|
| | 122 | |
|---|
| | 123 | if (utf8_length > parser->raw_unread) { |
|---|
| | 124 | if (parser->eof) { |
|---|
| | 125 | parser->error = YAML_READER_ERROR; |
|---|
| | 126 | return 0; |
|---|
| | 127 | } |
|---|
| | 128 | incomplete = 1; |
|---|
| | 129 | } |
|---|
| | 130 | |
|---|
| | 131 | /* Get the character checking it for validity. */ |
|---|
| | 132 | |
|---|
| | 133 | utf8_chunk = UTF8_FIRST_CHUNK(parser->raw_pointer ++); |
|---|
| | 134 | if (utf8_chunk == 0xFF) { |
|---|
| | 135 | parser->error = YAML_READER_ERROR; |
|---|
| | 136 | return 0; |
|---|
| | 137 | } |
|---|
| | 138 | ch = utf8_chunk; |
|---|
| | 139 | parser->raw_unread --; |
|---|
| | 140 | while (-- utf8_length) { |
|---|
| | 141 | utf8_chunk = UTF8_NEXT_CHUNK(parser->raw_pointer ++); |
|---|
| | 142 | if (utf8_chunk == 0xFF) { |
|---|
| | 143 | parser->error = YAML_READER_ERROR; |
|---|
| | 144 | return 0; |
|---|
| | 145 | } |
|---|
| | 146 | ch = ch << 6 + utf8_chunk; |
|---|
| | 147 | parser->raw_unread --; |
|---|
| | 148 | } |
|---|
| | 149 | |
|---|
| | 150 | break; |
|---|
| | 151 | |
|---|
| | 152 | case YAML_UTF16LE_ENCODING: |
|---|
| | 153 | |
|---|
| | 154 | /* Check if the raw buffer contains an incomplete character. */ |
|---|
| | 155 | |
|---|
| | 156 | if (parser->raw_unread < 2) { |
|---|
| | 157 | if (parser->eof) { |
|---|
| | 158 | parser->error = YAML_READER_ERROR; |
|---|
| | 159 | return 0; |
|---|
| | 160 | } |
|---|
| | 161 | incomplete = 1; |
|---|
| | 162 | } |
|---|
| | 163 | |
|---|
| | 164 | /* Get the current character. */ |
|---|
| | 165 | |
|---|
| | 166 | ch = UTF16LE_CHAR(parser->raw_pointer); |
|---|
| | 167 | parser->raw_pointer += 2; |
|---|
| | 168 | parser->raw_unread -= 2; |
|---|
| | 169 | |
|---|
| | 170 | break; |
|---|
| | 171 | |
|---|
| | 172 | case YAML_UTF16BE_ENCODING: |
|---|
| | 173 | |
|---|
| | 174 | /* Check if the raw buffer contains an incomplete character. */ |
|---|
| | 175 | |
|---|
| | 176 | if (parser->raw_unread < 2) { |
|---|
| | 177 | if (parser->eof) { |
|---|
| | 178 | parser->error = YAML_READER_ERROR; |
|---|
| | 179 | return 0; |
|---|
| | 180 | } |
|---|
| | 181 | incomplete = 1; |
|---|
| | 182 | } |
|---|
| | 183 | |
|---|
| | 184 | /* Get the current character. */ |
|---|
| | 185 | |
|---|
| | 186 | ch = UTF16BE_CHAR(parser->raw_pointer); |
|---|
| | 187 | parser->raw_pointer += 2; |
|---|
| | 188 | parser->raw_unread -= 2; |
|---|
| | 189 | |
|---|
| | 190 | break; |
|---|
| | 191 | } |
|---|
| | 192 | |
|---|
| | 193 | /* |
|---|
| | 194 | * Check if the character is in the allowed range: |
|---|
| | 195 | * #x9 | #xA | #xD | [#x20-#x7E] (8 bit) |
|---|
| | 196 | * | #x85 | [#xA0-#xD7FF] | [#xE000-#xFFFD] (16 bit) |
|---|
| | 197 | * | [#x10000-#x10FFFF] (32 bit) |
|---|
| | 198 | */ |
|---|
| | 199 | |
|---|
| | 200 | if (! (ch == 0x09 || ch == 0x0A || ch == 0x0D |
|---|
| | 201 | || (ch >= 0x20 && ch <= 0x7E) |
|---|
| | 202 | || (ch == 0x85) || (ch >= 0xA0 && ch <= 0xD7FF) |
|---|
| | 203 | || (ch >= 0xE000 && ch <= 0xFFFD) |
|---|
| | 204 | || (ch >= 0x10000 && ch <= 0x10FFFF))) { |
|---|
| | 205 | parser->error = YAML_READER_ERROR; |
|---|
| | 206 | return 0; |
|---|
| | 207 | } |
|---|
| | 208 | |
|---|
| | 209 | /* Finally put the character into the buffer. */ |
|---|
| | 210 | |
|---|
| | 211 | /* 0000 0000-0000 007F -> 0xxxxxxx */ |
|---|
| | 212 | if (ch <= 0x7F) { |
|---|
| | 213 | *(parser->buffer_end++) = ch; |
|---|
| | 214 | } |
|---|
| | 215 | /* 0000 0080-0000 07FF -> 110xxxxx 10xxxxxx */ |
|---|
| | 216 | else if (ch <= 0x7FF) { |
|---|
| | 217 | *(parser->buffer_end++) = 0xC0 + (ch >> 6) & 0x1F; |
|---|
| | 218 | *(parser->buffer_end++) = 0x80 + ch & 0x3F; |
|---|
| | 219 | } |
|---|
| | 220 | /* 0000 0800-0000 FFFF -> 1110xxxx 10xxxxxx 10xxxxxx */ |
|---|
| | 221 | else if (ch <= 0xFFFF) { |
|---|
| | 222 | *(parser->buffer_end++) = 0x80 + ch & 0x3F; |
|---|
| | 223 | *(parser->buffer_end++) = 0xC0 + (ch >> 6) & 0x1F; |
|---|
| | 224 | |
|---|
| | 225 | } |
|---|
| | 226 | /* 0001 0000-0010 FFFF -> 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ |
|---|
| | 227 | else { |
|---|
| | 228 | } |
|---|
| | 229 | } |
|---|
| | 230 | |
|---|
| | 231 | } |
|---|
| 52 | | |
|---|
| 53 | | |
|---|
| | 235 | /* |
|---|
| | 236 | * Determine the input stream encoding by checking the BOM symbol. If no BOM is |
|---|
| | 237 | * found, the UTF-8 encoding is assumed. Return 1 on success, 0 on failure. |
|---|
| | 238 | */ |
|---|
| | 239 | |
|---|
| | 240 | int |
|---|
| | 241 | yaml_parser_determine_encoding(yaml_parser_t *parser) |
|---|
| | 242 | { |
|---|
| | 243 | /* Ensure that we had enough bytes in the raw buffer. */ |
|---|
| | 244 | |
|---|
| | 245 | while (!parser->eof && parser->raw_unread < 2) { |
|---|
| | 246 | if (!yaml_parser_update_raw_buffer(parser)) { |
|---|
| | 247 | return 0; |
|---|
| | 248 | } |
|---|
| | 249 | } |
|---|
| | 250 | |
|---|
| | 251 | /* Determine the encoding. */ |
|---|
| | 252 | |
|---|
| | 253 | if (parser->raw_unread >= 2 && IS_UTF16BE_BOM(parser->raw_pointer)) { |
|---|
| | 254 | parser->encoding = YAML_UTF16BE_ENCODING; |
|---|
| | 255 | } |
|---|
| | 256 | else if (parser->raw_unread >= 2 && IS_UTF16LE_BOM(parser->raw_pointer)) { |
|---|
| | 257 | parser->encoding = YAML_UTF16LE_ENCODING; |
|---|
| | 258 | } |
|---|
| | 259 | else { |
|---|
| | 260 | parser->encoding = YAML_UTF8_ENCODING; |
|---|
| | 261 | } |
|---|
| | 262 | } |
|---|
| | 263 | |
|---|
| | 264 | /* |
|---|
| | 265 | * Update the raw buffer. |
|---|
| | 266 | */ |
|---|
| | 267 | |
|---|
| | 268 | int |
|---|
| | 269 | yaml_parser_update_raw_buffer(yaml_parser_t *parser) |
|---|
| | 270 | { |
|---|
| | 271 | size_t size_read = 0; |
|---|
| | 272 | |
|---|
| | 273 | /* Return if the raw buffer is full. */ |
|---|
| | 274 | |
|---|
| | 275 | if (parser->raw_unread == YAML_RAW_BUFFER_SIZE) return 1; |
|---|
| | 276 | |
|---|
| | 277 | /* Return on EOF. */ |
|---|
| | 278 | |
|---|
| | 279 | if (parser->eof) return 1; |
|---|
| | 280 | |
|---|
| | 281 | /* Move the remaining bytes in the raw buffer to the beginning. */ |
|---|
| | 282 | |
|---|
| | 283 | if (parser->raw_unread && parser->raw_buffer < parser->raw_pointer) { |
|---|
| | 284 | memmove(parser->raw_buffer, parser->raw_pointer, parser->raw_unread); |
|---|
| | 285 | } |
|---|
| | 286 | parser->raw_pointer = parser->raw_buffer; |
|---|
| | 287 | |
|---|
| | 288 | /* Call the read handler to fill the buffer. */ |
|---|
| | 289 | |
|---|
| | 290 | if (!parser->read_handler(parser->read_handler_data, |
|---|
| | 291 | parser->raw_buffer + parser->raw_unread, |
|---|
| | 292 | YAML_RAW_BUFFER_SIZE - parser->raw_unread, |
|---|
| | 293 | &size_read)) { |
|---|
| | 294 | parser->error = YAML_READER_ERROR; |
|---|
| | 295 | return 0; |
|---|
| | 296 | } |
|---|
| | 297 | parser->raw_unread += size_read; |
|---|
| | 298 | if (!size_read) { |
|---|
| | 299 | parser->eof = 1; |
|---|
| | 300 | } |
|---|
| | 301 | |
|---|
| | 302 | return 1; |
|---|
| | 303 | } |
|---|
| | 304 | |
|---|