source: branches/pyyaml3000/lib/yaml/scanner.py @ 46

Revision 46, 24.7 KB checked in by xi, 9 years ago (diff)

Some renaming.

Line 
1
2# Tokens:
3# YAML-DIRECTIVE(major_version, minor_version), TAG-DIRECTIVE(handle, prefix)
4# RESERVED-DIRECTIVE(name)
5# DOCUMENT-START, DOCUMENT-END
6# BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END
7# FLOW-SEQUENCE-START, FLOW-MAPPING-START, FLOW-SEQUENCE-END, FLOW-MAPPING-END
8# ENTRY, KEY, VALUE
9# ALIAS(name), ANCHOR(name), TAG(value), SCALAR(value, plain)
10
11__all__ = ['Scanner', 'ScannerError']
12
13from error import YAMLError
14from tokens import *
15
class ScannerError(YAMLError):
    """Error raised by `Scanner.fail()` when scanning cannot continue."""

    # TODO: include the context and the problem position in the message,
    # for example:
    # ScannerError: while reading a quoted string
    #         in '...', line 5, column 10:
    # key: "valu\?e"
    #      ^
    # got unknown quote character '?'
    #         in '...', line 5, column 15:
    # key: "valu\?e"
    #            ^
27
class SimpleKey:
    """Record describing a token that may turn out to start a simple key."""

    def __init__(self, token_number, required, index, line, column, marker):
        # Number of the candidate token and whether a key is mandatory there.
        self.token_number, self.required = token_number, required
        # Reader position (index, line, column) where the candidate starts.
        self.index, self.line, self.column = index, line, column
        # Marker taken from the reader at the same position.
        self.marker = marker
36
37class Scanner:
38
39
40    def __init__(self, reader):
41        """Initialize the scanner."""
42        # The input stream. The Reader class do the dirty work of checking for
43        # BOM and converting the input data to Unicode. It also adds NUL to
44        # the end.
45        #
46        # Reader supports the following methods
47        #   self.reader.peek(k=1)   # peek the next k characters
48        #   self.reader.forward(k=1)   # read the next k characters and move the
49        #                           # pointer
50        self.reader = reader
51
52        # Had we reached the end of the stream?
53        self.done = False
54
55        # The number of unclosed '{' and '['. `flow_level == 0` means block
56        # context.
57        self.flow_level = 0
58
59        # List of processed tokens that are not yet emitted.
60        self.tokens = []
61
62        # Number of tokens that were emitted through the `get_token` method.
63        self.tokens_taken = 0
64
65        # The current indentation level.
66        self.indent = -1
67
68        # Past indentation levels.
69        self.indents = []
70
71        # Variables related to simple keys treatment.
72
73        # A simple key is a key that is not denoted by the '?' indicator.
74        # Example of simple keys:
75        #   ---
76        #   block simple key: value
77        #   ? not a simple key:
78        #   : { flow simple key: value }
79        # We emit the KEY token before all keys, so when we find a potential
80        # simple key, we try to locate the corresponding ':' indicator.
81        # Simple keys should be limited to a single line and 1024 characters.
82
83        # Can a simple key start at the current position? A simple key may
84        # start:
85        # - at the beginning of the line, not counting indentation spaces
86        #       (in block context),
87        # - after '{', '[', ',' (in the flow context),
88        # - after '?', ':', '-' (in the block context).
89        # In the block context, this flag also signify if a block collection
90        # may start at the current position.
91        self.allow_simple_key = True
92
93        # Keep track of possible simple keys. This is a dictionary. The key
94        # is `flow_level`; there can be no more that one possible simple key
95        # for each level. The value is a SimpleKey record:
96        #   (token_number, required, index, line, column, marker)
97        # A simple key may start with ALIAS, ANCHOR, TAG, SCALAR(flow),
98        # '[', or '{' tokens.
99        self.possible_simple_keys = {}
100
101    # Two public methods.
102
103    def peek_token(self):
104        """Get the current token."""
105        while self.need_more_tokens():
106            self.fetch_more_tokens()
107        if self.tokens:
108            return self.tokens[0]
109
110    def get_token(self):
111        "Get the current token and remove it from the list of pending tokens."""
112        while self.need_more_tokens():
113            self.fetch_more_tokens()
114        if self.tokens:
115            self.tokens_taken += 1
116            return self.tokens.pop(0)
117
118    # Private methods.
119
120    def need_more_tokens(self):
121        if self.done:
122            return False
123        if not self.tokens:
124            return True
125        # The current token may be a potential simple key, so we
126        # need to look further.
127        self.stale_possible_simple_keys()
128        if self.next_possible_simple_key() == self.tokens_taken:
129            return True
130
131    def fetch_more_tokens(self):
132
133        # Eat whitespaces and comments until we reach the next token.
134        self.scan_to_next_token()
135
136        # Remove obsolete possible simple keys.
137        self.stale_possible_simple_keys()
138
139        # Compare the current indentation and column. It may add some tokens
140        # and decrease the current indentation level.
141        self.unwind_indent(self.reader.column)
142
143        #print
144        #print self.reader.get_marker().get_snippet()
145
146        # Peek the next character.
147        ch = self.reader.peek()
148
149        # Is it the end of reader?
150        if ch == u'\0':
151            return self.fetch_end()
152
153        # Is it a directive?
154        if ch == u'%' and self.check_directive():
155            return self.fetch_directive()
156
157        # Is it the document start?
158        if ch == u'-' and self.check_document_start():
159            return self.fetch_document_start()
160
161        # Is it the document end?
162        if ch == u'.' and self.check_document_end():
163            return self.fetch_document_end()
164
165        # Note: the order of the following checks is NOT significant.
166
167        # Is it the flow sequence start indicator?
168        if ch == u'[':
169            return self.fetch_flow_sequence_start()
170
171        # Is it the flow mapping start indicator?
172        if ch == u'{':
173            return self.fetch_flow_mapping_start()
174
175        # Is it the flow sequence end indicator?
176        if ch == u']':
177            return self.fetch_flow_sequence_end()
178
179        # Is it the flow mapping end indicator?
180        if ch == u'}':
181            return self.fetch_flow_mapping_end()
182
183        # Is it the entry indicator?
184        if ch in u'-,' and self.check_entry():
185            return self.fetch_entry()
186
187        # Is it the key indicator?
188        if ch == u'?' and self.check_key():
189            return self.fetch_key()
190
191        # Is it the value indicator?
192        if ch == u':' and self.check_value():
193            return self.fetch_value()
194
195        # Is it an alias?
196        if ch == u'*':
197            return self.fetch_alias()
198
199        # Is it an anchor?
200        if ch == u'&':
201            return self.fetch_anchor()
202
203        # Is it a tag?
204        if ch == u'!':
205            return self.fetch_tag()
206
207        # Is it a literal scalar?
208        if ch == u'|' and not self.flow_level:
209            return self.fetch_literal()
210
211        # Is it a folded scalar?
212        if ch == u'>' and not self.flow_level:
213            return self.fetch_folded()
214
215        # Is it a single quoted scalar?
216        if ch == u'\'':
217            return self.fetch_single()
218
219        # Is it a double quoted scalar?
220        if ch == u'\"':
221            return self.fetch_double()
222
223        # It must be a plain scalar then.
224        if self.check_plain():
225            return self.fetch_plain()
226
227        # No? It's an error. Let's produce a nice error message.
228        self.invalid_token()
229
230    # Simple keys treatment.
231
232    def next_possible_simple_key(self):
233        # Return the number of the nearest possible simple key. Actually we
234        # don't need to loop through the whole dictionary. We may replace it
235        # with the following code:
236        #   if not self.possible_simple_keys:
237        #       return None
238        #   return self.possible_simple_keys[
239        #           min(self.possible_simple_keys.keys())].token_number
240        min_token_number = None
241        for level in self.possible_simple_keys:
242            key = self.possible_simple_keys[level]
243            if min_token_number is None or key.token_number < min_token_number:
244                min_token_number = key.token_number
245        return min_token_number
246
247    def stale_possible_simple_keys(self):
248        # Remove entries that are no longer possible simple keys. According to
249        # the YAML specification, simple keys
250        # - should be limited to a single line,
251        # - should be no longer than 1024 characters.
252        # Disabling this procedure will allow simple keys of any length and
253        # height (may cause problems if indentation is broken though).
254        for level in self.possible_simple_keys.keys():
255            key = self.possible_simple_keys[level]
256            if key.line != self.reader.line  \
257                    or self.reader.index-key.index > 1024:
258                if key.required:
259                    self.fail("simple key is required")
260                del self.possible_simple_keys[level]
261
262    def save_possible_simple_key(self):
263        # The next token may start a simple key. We check if it's possible
264        # and save its position. This function is called for
265        #   ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'.
266
267        # Check if a simple key is required at the current position.
268        required = not self.flow_level and self.indent == self.reader.column
269
270        # The next token might be a simple key. Let's save it's number and
271        # position.
272        if self.allow_simple_key:
273            self.remove_possible_simple_key()
274            token_number = self.tokens_taken+len(self.tokens)
275            index = self.reader.index
276            line = self.reader.line
277            column = self.reader.column
278            marker = self.reader.get_marker()
279            key = SimpleKey(token_number, required,
280                    index, line, column, marker)
281            self.possible_simple_keys[self.flow_level] = key
282
283        # A simple key is required at the current position.
284        elif required:
285            self.fail("simple key is required")
286
287    def remove_possible_simple_key(self):
288        # Remove the saved possible key position at the current flow level.
289        if self.flow_level in self.possible_simple_keys:
290            key = self.possible_simple_keys[self.flow_level]
291            if key.required:
292                self.fail("simple key is required")
293
294    # Indentation functions.
295
296    def unwind_indent(self, column):
297
298        # In flow context, tokens should respect indentation.
299        if self.flow_level and self.indent > column:
300            self.fail("invalid intendation in the flow context")
301
302        # In block context, we may need to issue the BLOCK-END tokens.
303        while self.indent > column:
304            marker = self.reader.get_marker()
305            self.indent = self.indents.pop()
306            self.tokens.append(BlockEndToken(marker, marker))
307
308    def add_indent(self, column):
309        # Check if we need to increase indentation.
310        if self.indent < column:
311            self.indents.append(self.indent)
312            self.indent = column
313            return True
314        return False
315
316    # Fetchers.
317
318    def fetch_end(self):
319
320        # Set the current intendation to -1.
321        self.unwind_indent(-1)
322
323        # Reset everything (not really needed).
324        self.allow_simple_key = False
325        self.possible_simple_keys = {}
326
327        # Read the token.
328        marker = self.reader.get_marker()
329       
330        # Add END.
331        self.tokens.append(EndToken(marker, marker))
332
333        # The reader is ended.
334        self.done = True
335
336    def fetch_directive(self):
337       
338        # Set the current intendation to -1.
339        self.unwind_indent(-1)
340
341        # Reset simple keys.
342        self.remove_possible_simple_key()
343        self.allow_simple_key = False
344
345        # Scan and add DIRECTIVE.
346        self.scan_directive()
347
348    def fetch_document_start(self):
349        self.fetch_document_indicator(DocumentStartToken)
350
351    def fetch_document_end(self):
352        self.fetch_document_indicator(DocumentEndToken)
353
354    def fetch_document_indicator(self, TokenClass):
355
356        # Set the current intendation to -1.
357        self.unwind_indent(-1)
358
359        # Reset simple keys. Note that there could not be a block collection
360        # after '---'.
361        self.remove_possible_simple_key()
362        self.allow_simple_key = False
363
364        # Add DOCUMENT-START or DOCUMENT-END.
365        start_marker = self.reader.get_marker()
366        self.reader.forward(3)
367        end_marker = self.reader.get_marker()
368        self.tokens.append(TokenClass(start_marker, end_marker))
369
370    def fetch_flow_sequence_start(self):
371        self.fetch_flow_collection_start(FlowSequenceStartToken)
372
373    def fetch_flow_mapping_start(self):
374        self.fetch_flow_collection_start(FlowMappingStartToken)
375
376    def fetch_flow_collection_start(self, TokenClass):
377
378        # '[' and '{' may start a simple key.
379        self.save_possible_simple_key()
380
381        # Increase the flow level.
382        self.flow_level += 1
383
384        # Simple keys are allowed after '[' and '{'.
385        self.allow_simple_key = True
386
387        # Add FLOW-SEQUENCE-START or FLOW-MAPPING-START.
388        start_marker = self.reader.get_marker()
389        self.reader.forward()
390        end_marker = self.reader.get_marker()
391        self.tokens.append(TokenClass(start_marker, end_marker))
392
393    def fetch_flow_sequence_end(self):
394        self.fetch_flow_collection_end(FlowSequenceEndToken)
395
396    def fetch_flow_mapping_end(self):
397        self.fetch_flow_collection_end(FlowMappingEndToken)
398
399    def fetch_flow_collection_end(self, TokenClass):
400
401        # Reset possible simple key on the current level.
402        self.remove_possible_simple_key()
403
404        # Decrease the flow level.
405        self.flow_level -= 1
406
407        # No simple keys after ']' or '}'.
408        self.allow_simple_key = False
409
410        # Add FLOW-SEQUENCE-END or FLOW-MAPPING-END.
411        start_marker = self.reader.get_marker()
412        self.reader.forward()
413        end_marker = self.reader.get_marker()
414        self.tokens.append(TokenClass(start_marker, end_marker))
415
416    def fetch_entry(self):
417
418        # Block context needs additional checks.
419        if not self.flow_level:
420
421            # Are we allowed to start a new entry?
422            if not self.allow_simple_key:
423                self.fail("Cannot start a new entry here")
424
425            # We may need to add BLOCK-SEQUENCE-START.
426            if self.add_indent(self.reader.column):
427                marker = self.reader.get_marker()
428                self.tokens.append(BlockSequenceStartToken(marker, marker))
429
430        # Simple keys are allowed after '-' and ','.
431        self.allow_simple_key = True
432
433        # Reset possible simple key on the current level.
434        self.remove_possible_simple_key()
435
436        # Add ENTRY.
437        start_marker = self.reader.get_marker()
438        self.reader.forward()
439        end_marker = self.reader.get_marker()
440        self.tokens.append(EntryToken(start_marker, end_marker))
441
442    def fetch_key(self):
443       
444        # Block context needs additional checks.
445        if not self.flow_level:
446
447            # Are we allowed to start a key (not nessesary a simple)?
448            if not self.allow_simple_key:
449                self.fail("Cannot start a new key here")
450
451            # We may need to add BLOCK-MAPPING-START.
452            if self.add_indent(self.reader.column):
453                marker = self.reader.get_marker()
454                self.tokens.append(BlockMappingStartToken(marker, marker))
455
456        # Simple keys are allowed after '?' in the block context.
457        self.allow_simple_key = not self.flow_level
458
459        # Reset possible simple key on the current level.
460        self.remove_possible_simple_key()
461
462        # Add KEY.
463        start_marker = self.reader.get_marker()
464        self.reader.forward()
465        end_marker = self.reader.get_marker()
466        self.tokens.append(KeyToken(start_marker, end_marker))
467
468    def fetch_value(self):
469
470        # Do we determine a simple key?
471        if self.flow_level in self.possible_simple_keys:
472
473            # Add KEY.
474            key = self.possible_simple_keys[self.flow_level]
475            del self.possible_simple_keys[self.flow_level]
476            self.tokens.insert(key.token_number-self.tokens_taken,
477                    KeyToken(key.marker, key.marker))
478
479            # If this key starts a new block mapping, we need to add
480            # BLOCK-MAPPING-START.
481            if not self.flow_level:
482                if self.add_indent(key.column):
483                    self.tokens.insert(key.token_number-self.tokens_taken,
484                            BlockMappingStartToken(key.marker, key.marker))
485
486            # There cannot be two simple keys one after another.
487            self.allow_simple_key = False
488
489        # It must be a part of a complex key.
490        else:
491           
492            # Simple keys are allowed after ':' in the block context.
493            self.allow_simple_key = not self.flow_level
494
495            # Reset possible simple key on the current level.
496            self.remove_possible_simple_key()
497
498        # Add VALUE.
499        start_marker = self.reader.get_marker()
500        self.reader.forward()
501        end_marker = self.reader.get_marker()
502        self.tokens.append(ValueToken(start_marker, end_marker))
503
504    def fetch_alias(self):
505
506        # ALIAS could be a simple key.
507        self.save_possible_simple_key()
508
509        # No simple keys after ALIAS.
510        self.allow_simple_key = False
511
512        # Scan and add ALIAS.
513        self.scan_anchor(AliasToken)
514
515    def fetch_anchor(self):
516
517        # ANCHOR could start a simple key.
518        self.save_possible_simple_key()
519
520        # No simple keys after ANCHOR.
521        self.allow_simple_key = False
522
523        # Scan and add ANCHOR.
524        self.scan_anchor(AnchorToken)
525
526    def fetch_tag(self):
527
528        # TAG could start a simple key.
529        self.save_possible_simple_key()
530
531        # No simple keys after TAG.
532        self.allow_simple_key = False
533
534        # Scan and add TAG.
535        self.scan_tag()
536
537    def fetch_literal(self):
538        self.fetch_block_scalar(folded=False)
539
540    def fetch_folded(self):
541        self.fetch_block_scalar(folded=True)
542
543    def fetch_block_scalar(self, folded):
544
545        # A simple key may follow a block scalar.
546        self.allow_simple_key = True
547
548        # Reset possible simple key on the current level.
549        self.remove_possible_simple_key()
550
551        # Scan and add SCALAR.
552        self.scan_block_scalar(folded)
553
554    def fetch_single(self):
555        self.fetch_flow_scalar(double=False)
556
557    def fetch_double(self):
558        self.fetch_flow_scalar(double=True)
559
560    def fetch_flow_scalar(self, double):
561
562        # A flow scalar could be a simple key.
563        self.save_possible_simple_key()
564
565        # No simple keys after flow scalars.
566        self.allow_simple_key = False
567
568        # Scan and add SCALAR.
569        self.scan_flow_scalar(double)
570
571    def fetch_plain(self):
572
573        # A plain scalar could be a simple key.
574        self.save_possible_simple_key()
575
576        # No simple keys after plain scalars. But note that `scan_plain` will
577        # change this flag if the scan is finished at the beginning of the
578        # line.
579        self.allow_simple_key = False
580
581        # Scan and add SCALAR. May change `allow_simple_key`.
582        self.scan_plain()
583
584    # Checkers.
585
586    def check_directive(self):
587
588        # DIRECTIVE:        ^ '%' ...
589        # The '%' indicator is already checked.
590        if self.reader.column == 0:
591            return True
592
593    def check_document_start(self):
594
595        # DOCUMENT-START:   ^ '---' (' '|'\n')
596        if self.reader.column == 0:
597            prefix = self.reader.peek(4)
598            if prefix[:3] == u'---' and prefix[3] in u'\0 \t\r\n\x85\u2028\u2029':
599                return True
600
601    def check_document_end(self):
602
603        # DOCUMENT-END:     ^ '...' (' '|'\n')
604        if self.reader.column == 0:
605            prefix = self.reader.peek(4)
606            if prefix[:3] == u'...' and prefix[3] in u'\0 \t\r\n\x85\u2028\u2029':
607                return True
608
609    def check_entry(self):
610
611        # ENTRY(flow context):      ','
612        if self.flow_level:
613            return self.reader.peek() == u','
614
615        # ENTRY(block context):     '-' (' '|'\n')
616        else:
617            prefix = self.reader.peek(2)
618            return prefix[0] == u'-' and prefix[1] in u'\0 \t\r\n\x85\u2028\u2029'
619
620    def check_key(self):
621
622        # KEY(flow context):    '?'
623        if self.flow_level:
624            return True
625
626        # KEY(block context):   '?' (' '|'\n')
627        else:
628            prefix = self.reader.peek(2)
629            return prefix[1] in u'\0 \t\r\n\x85\u2028\u2029'
630
631    def check_value(self):
632
633        # VALUE(flow context):  ':'
634        if self.flow_level:
635            return True
636
637        # VALUE(block context): ':' (' '|'\n')
638        else:
639            prefix = self.reader.peek(2)
640            return prefix[1] in u'\0 \t\r\n\x85\u2028\u2029'
641
642    def check_plain(self):
643        return True
644
645    # Scanners.
646
647    def scan_to_next_token(self):
648        found = False
649        while not found:
650            while self.reader.peek() == u' ':
651                self.reader.forward()
652            if self.reader.peek() == u'#':
653                while self.reader.peek() not in u'\r\n':
654                    self.reader.forward()
655            if self.reader.peek() in u'\r\n':
656                self.reader.forward()
657                if not self.flow_level:
658                    self.allow_simple_key = True
659            else:
660                found = True
661
662    def scan_directive(self):
663        marker = self.reader.get_marker()
664        if self.reader.peek(5) == u'%YAML ':
665            self.tokens.append(YAMLDirectiveToken(1, 1, marker, marker))
666        elif self.reader.peek(4) == u'%TAG ':
667            self.tokens.append(TagDirectiveToken(marker, marker))
668        else:
669            self.tokens.append(ReservedDirectiveToken('', marker, marker))
670        while self.reader.peek() not in u'\0\r\n':
671            self.reader.forward()
672        self.reader.forward()
673
674    def scan_anchor(self, TokenClass):
675        start_marker = self.reader.get_marker()
676        while self.reader.peek() not in u'\0 \t\r\n,:':
677            self.reader.forward()
678        end_marker = self.reader.get_marker()
679        self.tokens.append(TokenClass('', start_marker, end_marker))
680
681    def scan_tag(self):
682        start_marker = self.reader.get_marker()
683        while self.reader.peek() not in u'\0 \t\r\n':
684            self.reader.forward()
685        end_marker = self.reader.get_marker()
686        self.tokens.append(TagToken('', start_marker, end_marker))
687
688    def scan_block_scalar(self, folded):
689        start_marker = self.reader.get_marker()
690        indent = self.indent+1
691        if indent < 1:
692            indent = 1
693        while True:
694            while self.reader.peek() and self.reader.peek() and self.reader.peek() not in u'\0\r\n\x85\u2028\u2029':
695                self.reader.forward()
696            if self.reader.peek() != u'\0':
697                self.reader.forward()
698            count = 0
699            while count < indent and self.reader.peek() == u' ':
700                self.reader.forward()
701                count += 1
702            if count < indent and self.reader.peek() not in u'#\r\n\x85\u2028\u2029':
703                break
704        self.tokens.append(ScalarToken('', False, start_marker, start_marker))
705
706    def scan_flow_scalar(self, double):
707        marker = self.reader.get_marker()
708        quote = self.reader.peek()
709        self.reader.forward()
710        while self.reader.peek() != quote:
711            if double and self.reader.peek() == u'\\':
712                self.reader.forward(2)
713            elif not double and self.reader.peek(3)[1:] == u'\'\'':
714                self.reader.forward(3)
715            else:
716                self.reader.forward(1)
717        self.reader.forward(1)
718        self.tokens.append(ScalarToken('', False, marker, marker))
719
720    def scan_plain(self):
721        indent = self.indent+1
722        if indent < 1:
723            indent = 1
724        space = False
725        marker = self.reader.get_marker()
726        while True:
727            while self.reader.peek() == u' ':
728                self.reader.forward()
729                space = True
730            while self.reader.peek() not in u'\0\r\n?:,[]{}#'   \
731                    or (not space and self.reader.peek() == '#')    \
732                    or (not self.flow_level and self.reader.peek() in '?,[]{}') \
733                    or (not self.flow_level and self.reader.peek() == ':' and self.reader.peek(2)[1] not in u' \0\r\n'):
734                space = self.reader.peek() not in u' \t'
735                self.reader.forward()
736                self.allow_simple_key = False
737            if self.reader.peek() not in u'\r\n':
738                break
739            while self.reader.peek() in u'\r\n':
740                self.reader.forward()
741                if not self.flow_level:
742                    self.allow_simple_key = True
743            count = 0
744            while self.reader.peek() == u' ' and count < indent:
745                self.reader.forward()
746                count += 1
747            if count < indent:
748                break
749            space = True
750        self.tokens.append(ScalarToken('', True, marker, marker))
751
752    def invalid_token(self):
753        self.fail("invalid token")
754
755    def fail(self, message):
756        raise ScannerError(message)
757
758#try:
759#    import psyco
760#    psyco.bind(Scanner)
761#except ImportError:
762#    pass
763
Note: See TracBrowser for help on using the repository browser.