source: libyaml/trunk/src/scanner.c @ 184

Revision 184, 17.4 KB checked in by xi, 8 years ago (diff)

Add scanner definitions.

Add a basic description of all tokens produced by the scanner.

Line 
1
2/*
3 * Introduction
4 * ************
5 *
6 * The following notes assume that you are familiar with the YAML specification
7 * (http://yaml.org/spec/cvs/current.html).  We mostly follow it, although in
8 * some cases we are less restrictive that it requires.
9 *
10 * The process of transforming a YAML stream into a sequence of events is
11 * divided on two steps: Scanning and Parsing.
12 *
13 * The Scanner transforms the input stream into a sequence of tokens, while the
14 * parser transform the sequence of tokens produced by the Scanner into a
15 * sequence of parsing events.
16 *
17 * The Scanner is rather clever and complicated. The Parser, on the contrary,
18 * is a straightforward implementation of a recursive-descendant parser (or,
19 * LL(1) parser, as it is usually called).
20 *
21 * Actually there are two issues of Scanning that might be called "clever", the
22 * rest is quite straightforward.  The issues are "block collection start" and
23 * "simple keys".  Both issues are explained below in details.
24 *
25 * Here the Scanning step is explained and implemented.  We start with the list
26 * of all the tokens produced by the Scanner together with short descriptions.
27 *
28 * Now, tokens:
29 *
30 *      STREAM-START(encoding)          # The stream start.
31 *      STREAM-END                      # The stream end.
32 *      VERSION-DIRECTIVE(major,minor)  # The '%YAML' directive.
33 *      TAG-DIRECTIVE(handle,prefix)    # The '%TAG' directive.
34 *      DOCUMENT-START                  # '---'
35 *      DOCUMENT-END                    # '...'
36 *      BLOCK-SEQUENCE-START            # Indentation increase denoting a block
37 *      BLOCK-MAPPING-START             # sequence or a block mapping.
38 *      BLOCK-END                       # Indentation decrease.
39 *      FLOW-SEQUENCE-START             # '['
40 *      FLOW-SEQUENCE-END               # ']'
41 *      BLOCK-SEQUENCE-START            # '{'
42 *      BLOCK-SEQUENCE-END              # '}'
43 *      BLOCK-ENTRY                     # '-'
44 *      FLOW-ENTRY                      # ','
45 *      KEY                             # '?' or nothing (simple keys).
46 *      VALUE                           # ':'
47 *      ALIAS(anchor)                   # '*anchor'
48 *      ANCHOR(anchor)                  # '&anchor'
49 *      TAG(handle,suffix)              # '!handle!suffix'
50 *      SCALAR(value,style)             # A scalar.
51 *
52 * The following two tokens are "virtual" tokens denoting the beginning and the
53 * end of the stream:
54 *
55 *      STREAM-START(encoding)
56 *      STREAM-END
57 *
58 * We pass the information about the input stream encoding with the
59 * STREAM-START token.
60 *
61 * The next two tokens are responsible for tags:
62 *
63 *      VERSION-DIRECTIVE(major,minor)
64 *      TAG-DIRECTIVE(handle,prefix)
65 *
66 * Example:
67 *
68 *      %YAML   1.1
69 *      %TAG    !   !foo
70 *      %TAG    !yaml!  tag:yaml.org,2002:
71 *      ---
72 *
73 * The correspoding sequence of tokens:
74 *
75 *      STREAM-START(utf-8)
76 *      VERSION-DIRECTIVE(1,1)
77 *      TAG-DIRECTIVE("!","!foo")
78 *      TAG-DIRECTIVE("!yaml","tag:yaml.org,2002:")
79 *      DOCUMENT-START
80 *      STREAM-END
81 *
82 * Note that the VERSION-DIRECTIVE and TAG-DIRECTIVE tokens occupy a whole
83 * line.
84 *
85 * The document start and end indicators are represented by:
86 *
87 *      DOCUMENT-START
88 *      DOCUMENT-END
89 *
90 * Note that if a YAML stream contains an implicit document (without '---'
91 * and '...' indicators), no DOCUMENT-START and DOCUMENT-END tokens will be
92 * produced.
93 *
94 * In the following examples, we present whole documents together with the
95 * produced tokens.
96 *
97 *      1. An implicit document:
98 *
99 *          'a scalar'
100 *
101 *      Tokens:
102 *
103 *          STREAM-START(utf-8)
104 *          SCALAR("a scalar",single-quoted)
105 *          STREAM-END
106 *
107 *      2. An explicit document:
108 *
109 *          ---
110 *          'a scalar'
111 *          ...
112 *
113 *      Tokens:
114 *
115 *          STREAM-START(utf-8)
116 *          DOCUMENT-START
117 *          SCALAR("a scalar",single-quoted)
118 *          DOCUMENT-END
119 *          STREAM-END
120 *
121 *      3. Several documents in a stream:
122 *
123 *          'a scalar'
124 *          ---
125 *          'another scalar'
126 *          ---
127 *          'yet another scalar'
128 *
129 *      Tokens:
130 *
131 *          STREAM-START(utf-8)
132 *          SCALAR("a scalar",single-quoted)
133 *          DOCUMENT-START
134 *          SCALAR("another scalar",single-quoted)
135 *          DOCUMENT-START
136 *          SCALAR("yet another scalar",single-quoted)
137 *          STREAM-END
138 *
139 * We have already introduced the SCALAR token above.  The following tokens are
140 * used to describe aliases, anchors, tag, and scalars:
141 *
142 *      ALIAS(anchor)
143 *      ANCHOR(anchor)
144 *      TAG(handle,suffix)
145 *      SCALAR(value,style)
146 *
147 * The following series of examples illustrate the usage of these tokens:
148 *
149 *      1. A recursive sequence:
150 *
151 *          &A [ *A ]
152 *
153 *      Tokens:
154 *
155 *          STREAM-START(utf-8)
156 *          ANCHOR("A")
157 *          FLOW-SEQUENCE-START
158 *          ALIAS("A")
159 *          FLOW-SEQUENCE-END
160 *          STREAM-END
161 *
162 *      2. A tagged scalar:
163 *
164 *          !!float "3.14"  # A good approximation.
165 *
166 *      Tokens:
167 *
168 *          STREAM-START(utf-8)
169 *          TAG("!!","float")
170 *          SCALAR("3.14",double-quoted)
171 *          STREAM-END
172 *
173 *      3. Various scalar styles:
174 *
175 *          --- # Implicit empty plain scalars do not produce tokens.
176 *          --- a plain scalar
177 *          --- 'a single-quoted scalar'
178 *          --- "a double-quoted scalar"
179 *          --- |-
180 *            a literal scalar
181 *          --- >-
182 *            a folded
183 *            scalar
184 *
185 *      Tokens:
186 *
187 *          STREAM-START(utf-8)
188 *          DOCUMENT-START
189 *          DOCUMENT-START
190 *          SCALAR("a plain scalar",plain)
191 *          DOCUMENT-START
192 *          SCALAR("a single-quoted scalar",single-quoted)
193 *          DOCUMENT-START
194 *          SCALAR("a double-quoted scalar",double-quoted)
195 *          DOCUMENT-START
196 *          SCALAR("a literal scalar",literal)
197 *          DOCUMENT-START
198 *          SCALAR("a folded scalar",folded)
199 *          STREAM-END
200 *
201 * Now it's time to review collection-related tokens. We will start with
202 * flow collections:
203 *
204 *      FLOW-SEQUENCE-START
205 *      FLOW-SEQUENCE-END
206 *      FLOW-MAPPING-START
207 *      FLOW-MAPPING-END
208 *      FLOW-ENTRY
209 *      KEY
210 *      VALUE
211 *
212 * The tokens FLOW-SEQUENCE-START, FLOW-SEQUENCE-END, FLOW-MAPPING-START, and
213 * FLOW-MAPPING-END represent the indicators '[', ']', '{', and '}'
214 * correspondingly.  FLOW-ENTRY represent the ',' indicator.  Finally the
215 * indicators '?' and ':', which are used for denoting mapping keys and values,
216 * are represented by the KEY and VALUE tokens.
217 *
218 * The following examples show flow collections:
219 *
220 *      1. A flow sequence:
221 *
222 *          [item 1, item 2, item 3]
223 *
224 *      Tokens:
225 *
226 *          STREAM-START(utf-8)
227 *          FLOW-SEQUENCE-START
228 *          SCALAR("item 1",plain)
229 *          FLOW-ENTRY
230 *          SCALAR("item 2",plain)
231 *          FLOW-ENTRY
232 *          SCALAR("item 3",plain)
233 *          FLOW-SEQUENCE-END
234 *          STREAM-END
235 *
236 *      2. A flow mapping:
237 *
238 *          {
239 *              a simple key: a value,  # Note that the KEY token is produced.
240 *              ? a complex key: another value,
241 *          }
242 *
243 *      Tokens:
244 *
245 *          STREAM-START(utf-8)
246 *          FLOW-MAPPING-START
247 *          KEY
248 *          SCALAR("a simple key",plain)
249 *          VALUE
250 *          SCALAR("a value",plain)
251 *          FLOW-ENTRY
252 *          KEY
253 *          SCALAR("a complex key",plain)
254 *          VALUE
255 *          SCALAR("another value",plain)
256 *          FLOW-ENTRY
257 *          FLOW-MAPPING-END
258 *          STREAM-END
259 *
260 * A simple key is a key which is not denoted by the '?' indicator.  Note that
261 * the Scanner still produce the KEY token whenever it encounters a simple key.
262 *
263 * For scanning block collections, the following tokens are used (note that we
264 * repeat KEY and VALUE here):
265 *
266 *      BLOCK-SEQUENCE-START
267 *      BLOCK-MAPPING-START
268 *      BLOCK-END
269 *      BLOCK-ENTRY
270 *      KEY
271 *      VALUE
272 *
273 * The tokens BLOCK-SEQUENCE-START and BLOCK-MAPPING-START denote indentation
274 * increase that precedes a block collection (cf. the INDENT token in Python).
275 * The token BLOCK-END denote indentation decrease that ends a block collection
276 * (cf. the DEDENT token in Python).  However YAML has some syntax pecularities
277 * that makes detections of these tokens more complex.
278 *
279 * The tokens BLOCK-ENTRY, KEY, and VALUE are used to represent the indicators
280 * '-', '?', and ':' correspondingly.
281 *
282 * The following examples show how the tokens BLOCK-SEQUENCE-START,
283 * BLOCK-MAPPING-START, and BLOCK-END are emitted by the Scanner:
284 *
285 *      1. Block sequences:
286 *
287 *          - item 1
288 *          - item 2
289 *          -
290 *            - item 3.1
291 *            - item 3.2
292 *          -
293 *            key 1: value 1
294 *            key 2: value 2
295 *
296 *      Tokens:
297 *
298 *          STREAM-START(utf-8)
299 *          BLOCK-SEQUENCE-START
300 *          BLOCK-ENTRY
301 *          SCALAR("item 1",plain)
302 *          BLOCK-ENTRY
303 *          SCALAR("item 2",plain)
304 *          BLOCK-ENTRY
305 *          BLOCK-SEQUENCE-START
306 *          BLOCK-ENTRY
307 *          SCALAR("item 3.1",plain)
308 *          BLOCK-ENTRY
309 *          SCALAR("item 3.2",plain)
310 *          BLOCK-END
311 *          BLOCK-ENTRY
312 *          BLOCK-MAPPING-START
313 *          KEY
314 *          SCALAR("key 1",plain)
315 *          VALUE
316 *          SCALAR("value 1",plain)
317 *          KEY
318 *          SCALAR("key 2",plain)
319 *          VALUE
320 *          SCALAR("value 2",plain)
321 *          BLOCK-END
322 *          BLOCK-END
323 *          STREAM-END
324 *
325 *      2. Block mappings:
326 *
327 *          a simple key: a value   # The KEY token is produced here.
328 *          ? a complex key
329 *          : another value
330 *          a mapping:
331 *            key 1: value 1
332 *            key 2: value 2
333 *          a sequence:
334 *            - item 1
335 *            - item 2
336 *
337 *      Tokens:
338 *
339 *          STREAM-START(utf-8)
340 *          BLOCK-MAPPING-START
341 *          KEY
342 *          SCALAR("a simple key",plain)
343 *          VALUE
344 *          SCALAR("a value",plain)
345 *          KEY
346 *          SCALAR("a complex key",plain)
347 *          VALUE
348 *          SCALAR("another value",plain)
349 *          KEY
350 *          SCALAR("a mapping",plain)
351 *          BLOCK-MAPPING-START
352 *          KEY
353 *          SCALAR("key 1",plain)
354 *          VALUE
355 *          SCALAR("value 1",plain)
356 *          KEY
357 *          SCALAR("key 2",plain)
358 *          VALUE
359 *          SCALAR("value 2",plain)
360 *          BLOCK-END
361 *          KEY
362 *          SCALAR("a sequence",plain)
363 *          VALUE
364 *          BLOCK-SEQUENCE-START
365 *          BLOCK-ENTRY
366 *          SCALAR("item 1",plain)
367 *          BLOCK-ENTRY
368 *          SCALAR("item 2",plain)
369 *          BLOCK-END
370 *          BLOCK-END
371 *          STREAM-END
372 *
373 * YAML does not always require to start a new block collection from a new
374 * line.  If the current line contains only '-', '?', and ':' indicators, a new
375 * block collection may start at the current line.  The following examples
376 * illustrate this case:
377 *
378 *      1. Collections in a sequence:
379 *
380 *          - - item 1
381 *            - item 2
382 *          - key 1: value 1
383 *            key 2: value 2
384 *          - ? complex key
385 *            : complex value
386 *
387 *      Tokens:
388 *
389 *          STREAM-START(utf-8)
390 *          BLOCK-SEQUENCE-START
391 *          BLOCK-ENTRY
392 *          BLOCK-SEQUENCE-START
393 *          BLOCK-ENTRY
394 *          SCALAR("item 1",plain)
395 *          BLOCK-ENTRY
396 *          SCALAR("item 2",plain)
397 *          BLOCK-END
398 *          BLOCK-ENTRY
399 *          BLOCK-MAPPING-START
400 *          KEY
401 *          SCALAR("key 1",plain)
402 *          VALUE
403 *          SCALAR("value 1",plain)
404 *          KEY
405 *          SCALAR("key 2",plain)
406 *          VALUE
407 *          SCALAR("value 2",plain)
408 *          BLOCK-END
409 *          BLOCK-ENTRY
410 *          BLOCK-MAPPING-START
411 *          KEY
412 *          SCALAR("complex key")
413 *          VALUE
414 *          SCALAR("complex value")
415 *          BLOCK-END
416 *          BLOCK-END
417 *          STREAM-END
418 *
419 *      2. Collections in a mapping:
420 *
421 *          ? a sequence
422 *          : - item 1
423 *            - item 2
424 *          ? a mapping
425 *          : key 1: value 1
426 *            key 2: value 2
427 *
428 *      Tokens:
429 *
430 *          STREAM-START(utf-8)
431 *          BLOCK-MAPPING-START
432 *          KEY
433 *          SCALAR("a sequence",plain)
434 *          VALUE
435 *          BLOCK-SEQUENCE-START
436 *          BLOCK-ENTRY
437 *          SCALAR("item 1",plain)
438 *          BLOCK-ENTRY
439 *          SCALAR("item 2",plain)
440 *          BLOCK-END
441 *          KEY
442 *          SCALAR("a mapping",plain)
443 *          VALUE
444 *          BLOCK-MAPPING-START
445 *          KEY
446 *          SCALAR("key 1",plain)
447 *          VALUE
448 *          SCALAR("value 1",plain)
449 *          KEY
450 *          SCALAR("key 2",plain)
451 *          VALUE
452 *          SCALAR("value 2",plain)
453 *          BLOCK-END
454 *          BLOCK-END
455 *          STREAM-END
456 *
457 * YAML also permits non-indented sequences if they are included into a block
458 * mapping.  In this case, the token BLOCK-SEQUENCE-START is not produced:
459 *
460 *      key:
461 *      - item 1    # BLOCK-SEQUENCE-START is NOT produced here.
462 *      - item 2
463 *
464 * Tokens:
465 *
466 *      STREAM-START(utf-8)
467 *      BLOCK-MAPPING-START
468 *      KEY
469 *      SCALAR("key",plain)
470 *      VALUE
471 *      BLOCK-ENTRY
472 *      SCALAR("item 1",plain)
473 *      BLOCK-ENTRY
474 *      SCALAR("item 2",plain)
475 *      BLOCK-END
476 */
477
478#if HAVE_CONFIG_H
479#include <config.h>
480#endif
481
482#include <yaml/yaml.h>
483
484#include <assert.h>
485
486/*
487 * Public API declarations.
488 */
489
490YAML_DECLARE(yaml_token_t *)
491yaml_parser_get_token(yaml_parser_t *parser);
492
493YAML_DECLARE(yaml_token_t *)
494yaml_parser_peek_token(yaml_parser_t *parser);
495
496/*
497 * High-level token API.
498 */
499
500static int
501yaml_parser_fetch_more_tokens(yaml_parser_t *parser);
502
503static int
504yaml_parser_fetch_next_token(yaml_parser_t *parser);
505
506/*
507 * Potential simple keys.
508 */
509
510static int
511yaml_parser_stale_simple_keys(yaml_parser_t *parser);
512
513static int
514yaml_parser_save_simple_key(yaml_parser_t *parser);
515
516static int
517yaml_parser_remove_simple_key(yaml_parser_t *parser);
518
519/*
520 * Indentation treatment.
521 */
522
523static int
524yaml_parser_add_indent(yaml_parser_t *parser);
525
526static int
527yaml_parser_remove_indent(yaml_parser_t *parser);
528
529/*
530 * Token fetchers.
531 */
532
533static int
534yaml_parser_fetch_stream_start(yaml_parser_t *parser);
535
536static int
537yaml_parser_fetch_stream_end(yaml_parser_t *parser);
538
539static int
540yaml_parser_fetch_directive(yaml_parser_t *parser);
541
542static int
543yaml_parser_fetch_document_start(yaml_parser_t *parser);
544
545static int
546yaml_parser_fetch_document_end(yaml_parser_t *parser);
547
548static int
549yaml_parser_fetch_document_indicator(yaml_parser_t *parser,
550        yaml_token_type_t type);
551
552static int
553yaml_parser_fetch_flow_sequence_start(yaml_parser_t *parser);
554
555static int
556yaml_parser_fetch_flow_mapping_start(yaml_parser_t *parser);
557
558static int
559yaml_parser_fetch_flow_collection_start(yaml_parser_t *parser,
560        yaml_token_type_t type);
561
562static int
563yaml_parser_fetch_flow_sequence_end(yaml_parser_t *parser);
564
565static int
566yaml_parser_fetch_flow_mapping_end(yaml_parser_t *parser);
567
568static int
569yaml_parser_fetch_flow_collection_end(yaml_parser_t *parser,
570        yaml_token_type_t type);
571
572static int
573yaml_parser_fetch_flow_entry(yaml_parser_t *parser);
574
575static int
576yaml_parser_fetch_block_entry(yaml_parser_t *parser);
577
578static int
579yaml_parser_fetch_key(yaml_parser_t *parser);
580
581static int
582yaml_parser_fetch_value(yaml_parser_t *parser);
583
584static int
585yaml_parser_fetch_alias(yaml_parser_t *parser);
586
587static int
588yaml_parser_fetch_anchor(yaml_parser_t *parser);
589
590static int
591yaml_parser_fetch_tag(yaml_parser_t *parser);
592
593static int
594yaml_parser_fetch_literal_scalar(yaml_parser_t *parser);
595
596static int
597yaml_parser_fetch_folded_scalar(yaml_parser_t *parser);
598
599static int
600yaml_parser_fetch_block_scalar(yaml_parser_t *parser, int literal);
601
602static int
603yaml_parser_fetch_single_quoted_scalar(yaml_parser_t *parser);
604
605static int
606yaml_parser_fetch_double_quoted_scalar(yaml_parser_t *parser);
607
608static int
609yaml_parser_fetch_flow_scalar(yaml_parser_t *parser, int single);
610
611static int
612yaml_parser_fetch_plain_scalar(yaml_parser_t *parser);
613
614/*
615 * Token scanners.
616 */
617
618static int
619yaml_parser_scan_to_next_token(yaml_parser_t *parser);
620
621static yaml_token_t *
622yaml_parser_scan_directive(yaml_parser_t *parser);
623
624static int
625yaml_parser_scan_directive_name(yaml_parser_t *parser,
626        yaml_mark_t start_mark, yaml_char_t **name);
627
628static int
629yaml_parser_scan_yaml_directive_value(yaml_parser_t *parser,
630        yaml_mark_t start_mark, int *major, int *minor);
631
632static int
633yaml_parser_scan_yaml_directive_number(yaml_parser_t *parser,
634        yaml_mark_t start_mark, int *number);
635
636static int
637yaml_parser_scan_tag_directive_value(yaml_parser_t *parser,
638        yaml_char_t **handle, yaml_char_t **prefix);
639
640static yaml_token_t *
641yaml_parser_scan_anchor(yaml_parser_t *parser,
642        yaml_token_type_t type);
643
644static yaml_token_t *
645yaml_parser_scan_tag(yaml_parser_t *parser);
646
647static int
648yaml_parser_scan_tag_handle(yaml_parser_t *parser, int directive,
649        yaml_mark_t start_mark, yaml_char_t **handle);
650
651static int
652yaml_parser_scan_tag_uri(yaml_parser_t *parser, int directive,
653        yaml_mark_t start_mark, yaml_char_t **url);
654
655static yaml_token_t *
656yaml_parser_scan_block_scalar(yaml_parser_t *parser, int literal);
657
658static int
659yaml_parser_scan_block_scalar_indicators(yaml_parser_t *parser,
660        yaml_mark_t start_mark, int *chomping, int *increment);
661
662static yaml_token_t *
663yaml_parser_scan_flow_scalar(yaml_parser_t *parser, int single);
664
665static yaml_token_t *
666yaml_parser_scan_plain_scalar(yaml_parser_t *parser);
667
Note: See TracBrowser for help on using the repository browser.