source: pyyaml/branches/working-on-emitter/lib/yaml/scanner.py @ 127

Revision 127, 53.4 KB checked in by xi, 8 years ago (diff)

Parser now provides style information. Allow empty plain scalars if a tag or anchor is given.

RevLine 
[39]1
[55]2# Scanner produces tokens of the following types:
[118]3# STREAM-START
4# STREAM-END
[55]5# DIRECTIVE(name, value)
6# DOCUMENT-START
7# DOCUMENT-END
8# BLOCK-SEQUENCE-START
9# BLOCK-MAPPING-START
10# BLOCK-END
11# FLOW-SEQUENCE-START
12# FLOW-MAPPING-START
13# FLOW-SEQUENCE-END
14# FLOW-MAPPING-END
15# BLOCK-ENTRY
16# FLOW-ENTRY
17# KEY
18# VALUE
19# ALIAS(value)
20# ANCHOR(value)
21# TAG(value)
22# SCALAR(value, plain)
[57]23#
24# Read comments in the Scanner code for more details.
25#
[43]26
[46]27__all__ = ['Scanner', 'ScannerError']
[43]28
[52]29from error import MarkedYAMLError
[46]30from tokens import *
[39]31
class ScannerError(MarkedYAMLError):
    """Error raised when the input stream cannot be tokenized."""
[51]34
class SimpleKey:
    """Record describing a potential simple key.

    See the simple keys treatment in Scanner below for details.
    """

    def __init__(self, token_number, required, index, line, column, mark=None,
            inline=None):
        # Absolute number of the token in the token stream.
        self.token_number = token_number
        # Whether a ':' must follow (block context, key at the indent column).
        self.required = required
        # Stream position where the candidate key starts.
        self.index = index
        self.line = line
        self.column = column
        self.mark = mark
        # Style hint: whether the key starts on the same line as the
        # previous token.
        self.inline = inline
[43]47
[39]48class Scanner:
49
[46]50
51    def __init__(self, reader):
[39]52        """Initialize the scanner."""
[46]53        # The input stream. The Reader class do the dirty work of checking for
[43]54        # BOM and converting the input data to Unicode. It also adds NUL to
55        # the end.
[39]56        #
[46]57        # Reader supports the following methods
[48]58        #   self.reader.peek(i=0)       # peek the next i-th character
59        #   self.reader.prefix(l=1)     # peek the next l characters
60        #   self.reader.forward(l=1)    # read the next l characters
61                                        # and move the pointer
[46]62        self.reader = reader
[39]63
64        # Had we reached the end of the stream?
65        self.done = False
66
67        # The number of unclosed '{' and '['. `flow_level == 0` means block
68        # context.
69        self.flow_level = 0
70
71        # List of processed tokens that are not yet emitted.
72        self.tokens = []
73
[118]74        # Add the STREAM-START token.
75        self.fetch_stream_start()
76
[39]77        # Number of tokens that were emitted through the `get_token` method.
78        self.tokens_taken = 0
79
80        # The current indentation level.
81        self.indent = -1
82
83        # Past indentation levels.
84        self.indents = []
85
[127]86        # Used for providing style information to the parser.
87        self.current_line = self.previous_line = self.reader.line
88        self.current_column = self.previus_column = self.reader.column
89
[43]90        # Variables related to simple keys treatment.
[39]91
92        # A simple key is a key that is not denoted by the '?' indicator.
93        # Example of simple keys:
94        #   ---
95        #   block simple key: value
96        #   ? not a simple key:
97        #   : { flow simple key: value }
98        # We emit the KEY token before all keys, so when we find a potential
99        # simple key, we try to locate the corresponding ':' indicator.
100        # Simple keys should be limited to a single line and 1024 characters.
101
[43]102        # Can a simple key start at the current position? A simple key may
103        # start:
104        # - at the beginning of the line, not counting indentation spaces
105        #       (in block context),
106        # - after '{', '[', ',' (in the flow context),
107        # - after '?', ':', '-' (in the block context).
[60]108        # In the block context, this flag also signifies if a block collection
[43]109        # may start at the current position.
110        self.allow_simple_key = True
[39]111
112        # Keep track of possible simple keys. This is a dictionary. The key
113        # is `flow_level`; there can be no more that one possible simple key
[43]114        # for each level. The value is a SimpleKey record:
[116]115        #   (token_number, required, index, line, column, mark)
[43]116        # A simple key may start with ALIAS, ANCHOR, TAG, SCALAR(flow),
117        # '[', or '{' tokens.
[39]118        self.possible_simple_keys = {}
119
[51]120    # Public methods.
[39]121
[51]122    def check(self, *choices):
123        # Check if the next token is one of the given types.
[43]124        while self.need_more_tokens():
[39]125            self.fetch_more_tokens()
126        if self.tokens:
[51]127            for choice in choices:
128                if isinstance(self.tokens[0], choice):
129                    return True
130        return False
131
132    def peek(self):
133        # Return the next token, but do not delete if from the queue.
134        while self.need_more_tokens():
135            self.fetch_more_tokens()
136        if self.tokens:
[39]137            return self.tokens[0]
138
[51]139    def get(self):
140        # Return the next token.
[39]141        while self.need_more_tokens():
142            self.fetch_more_tokens()
143        if self.tokens:
144            self.tokens_taken += 1
145            return self.tokens.pop(0)
146
[51]147    def __iter__(self):
148        # Iterator protocol.
149        while self.need_more_tokens():
150            self.fetch_more_tokens()
151        while self.tokens:
152            self.tokens_taken += 1
153            yield self.tokens.pop(0)
154            while self.need_more_tokens():
155                self.fetch_more_tokens()
156
[43]157    # Private methods.
[39]158
159    def need_more_tokens(self):
160        if self.done:
161            return False
162        if not self.tokens:
163            return True
164        # The current token may be a potential simple key, so we
165        # need to look further.
[43]166        self.stale_possible_simple_keys()
[39]167        if self.next_possible_simple_key() == self.tokens_taken:
168            return True
169
170    def fetch_more_tokens(self):
171
172        # Eat whitespaces and comments until we reach the next token.
[43]173        self.scan_to_next_token()
[39]174
[43]175        # Remove obsolete possible simple keys.
176        self.stale_possible_simple_keys()
177
[39]178        # Compare the current indentation and column. It may add some tokens
[43]179        # and decrease the current indentation level.
[46]180        self.unwind_indent(self.reader.column)
[39]181
182        # Peek the next character.
[46]183        ch = self.reader.peek()
[39]184
[48]185        # Is it the end of stream?
[43]186        if ch == u'\0':
[48]187            return self.fetch_stream_end()
[39]188
189        # Is it a directive?
190        if ch == u'%' and self.check_directive():
191            return self.fetch_directive()
192
193        # Is it the document start?
194        if ch == u'-' and self.check_document_start():
195            return self.fetch_document_start()
196
197        # Is it the document end?
198        if ch == u'.' and self.check_document_end():
199            return self.fetch_document_end()
200
[52]201        # TODO: support for BOM within a stream.
202        #if ch == u'\uFEFF':
203        #    return self.fetch_bom()    <-- issue BOMToken
204
[39]205        # Note: the order of the following checks is NOT significant.
206
207        # Is it the flow sequence start indicator?
208        if ch == u'[':
209            return self.fetch_flow_sequence_start()
210
211        # Is it the flow mapping start indicator?
212        if ch == u'{':
213            return self.fetch_flow_mapping_start()
214
215        # Is it the flow sequence end indicator?
216        if ch == u']':
217            return self.fetch_flow_sequence_end()
218
219        # Is it the flow mapping end indicator?
220        if ch == u'}':
221            return self.fetch_flow_mapping_end()
222
[51]223        # Is it the flow entry indicator?
224        if ch in u',':
225            return self.fetch_flow_entry()
[43]226
[51]227        # Is it the block entry indicator?
228        if ch in u'-' and self.check_block_entry():
229            return self.fetch_block_entry()
230
[39]231        # Is it the key indicator?
232        if ch == u'?' and self.check_key():
233            return self.fetch_key()
234
235        # Is it the value indicator?
236        if ch == u':' and self.check_value():
237            return self.fetch_value()
238
239        # Is it an alias?
240        if ch == u'*':
241            return self.fetch_alias()
242
243        # Is it an anchor?
244        if ch == u'&':
245            return self.fetch_anchor()
246
[43]247        # Is it a tag?
[39]248        if ch == u'!':
249            return self.fetch_tag()
250
[43]251        # Is it a literal scalar?
252        if ch == u'|' and not self.flow_level:
[39]253            return self.fetch_literal()
254
255        # Is it a folded scalar?
[43]256        if ch == u'>' and not self.flow_level:
[39]257            return self.fetch_folded()
258
259        # Is it a single quoted scalar?
260        if ch == u'\'':
261            return self.fetch_single()
262
263        # Is it a double quoted scalar?
264        if ch == u'\"':
265            return self.fetch_double()
266
[43]267        # It must be a plain scalar then.
[39]268        if self.check_plain():
269            return self.fetch_plain()
270
[43]271        # No? It's an error. Let's produce a nice error message.
[48]272        raise ScannerError("while scanning for the next token", None,
273                "found character %r that cannot start any token"
[116]274                % ch.encode('utf-8'), self.reader.get_mark())
[39]275
[43]276    # Simple keys treatment.
277
278    def next_possible_simple_key(self):
279        # Return the number of the nearest possible simple key. Actually we
280        # don't need to loop through the whole dictionary. We may replace it
281        # with the following code:
282        #   if not self.possible_simple_keys:
283        #       return None
284        #   return self.possible_simple_keys[
285        #           min(self.possible_simple_keys.keys())].token_number
286        min_token_number = None
287        for level in self.possible_simple_keys:
288            key = self.possible_simple_keys[level]
289            if min_token_number is None or key.token_number < min_token_number:
290                min_token_number = key.token_number
291        return min_token_number
292
293    def stale_possible_simple_keys(self):
294        # Remove entries that are no longer possible simple keys. According to
295        # the YAML specification, simple keys
296        # - should be limited to a single line,
297        # - should be no longer than 1024 characters.
298        # Disabling this procedure will allow simple keys of any length and
299        # height (may cause problems if indentation is broken though).
300        for level in self.possible_simple_keys.keys():
301            key = self.possible_simple_keys[level]
[46]302            if key.line != self.reader.line  \
303                    or self.reader.index-key.index > 1024:
[43]304                if key.required:
[116]305                    raise ScannerError("while scanning a simple key", key.mark,
306                            "could not found expected ':'", self.reader.get_mark())
[43]307                del self.possible_simple_keys[level]
308
309    def save_possible_simple_key(self):
310        # The next token may start a simple key. We check if it's possible
311        # and save its position. This function is called for
312        #   ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'.
313
314        # Check if a simple key is required at the current position.
[46]315        required = not self.flow_level and self.indent == self.reader.column
[43]316
[47]317        # A simple key is required only if it is the first token in the current
318        # line. Therefore it is always allowed.
319        assert self.allow_simple_key or not required
320
[43]321        # The next token might be a simple key. Let's save it's number and
322        # position.
323        if self.allow_simple_key:
324            self.remove_possible_simple_key()
325            token_number = self.tokens_taken+len(self.tokens)
[46]326            index = self.reader.index
327            line = self.reader.line
328            column = self.reader.column
[116]329            mark = self.reader.get_mark()
[127]330            inline = (self.current_line == self.previous_line)
[43]331            key = SimpleKey(token_number, required,
[127]332                    index, line, column, mark, inline)
[43]333            self.possible_simple_keys[self.flow_level] = key
334
335    def remove_possible_simple_key(self):
336        # Remove the saved possible key position at the current flow level.
337        if self.flow_level in self.possible_simple_keys:
338            key = self.possible_simple_keys[self.flow_level]
[47]339           
340            # I don't think it's possible, but I could be wrong.
341            assert not key.required
342            #if key.required:
[116]343            #    raise ScannerError("while scanning a simple key", key.mark,
344            #            "could not found expected ':'", self.reader.get_mark())
[43]345
346    # Indentation functions.
347
348    def unwind_indent(self, column):
349
[117]350        ## In flow context, tokens should respect indentation.
351        ## Actually the condition should be `self.indent >= column` according to
352        ## the spec. But this condition will prohibit intuitively correct
353        ## constructions such as
354        ## key : {
355        ## }
356        #if self.flow_level and self.indent > column:
357        #    raise ScannerError(None, None,
358        #            "invalid intendation or unclosed '[' or '{'",
359        #            self.reader.get_mark())
[43]360
[117]361        # In the flow context, indentation is ignored. We make the scanner less
362        # restrictive then specification requires.
363        if self.flow_level:
364            return
365
[43]366        # In block context, we may need to issue the BLOCK-END tokens.
367        while self.indent > column:
[116]368            mark = self.reader.get_mark()
[43]369            self.indent = self.indents.pop()
[116]370            self.tokens.append(BlockEndToken(mark, mark))
[43]371
372    def add_indent(self, column):
373        # Check if we need to increase indentation.
374        if self.indent < column:
375            self.indents.append(self.indent)
376            self.indent = column
377            return True
378        return False
379
380    # Fetchers.
381
[118]382    def fetch_stream_start(self):
383        # We always add STREAM-START as the first token and STREAM-END as the
384        # last token.
385
386        # Read the token.
387        mark = self.reader.get_mark()
388       
389        # Add STREAM-END.
[127]390        self.tokens.append(StreamStartToken(mark, mark,
391            encoding=self.reader.encoding))
[118]392
[48]393    def fetch_stream_end(self):
[39]394
395        # Set the current intendation to -1.
[43]396        self.unwind_indent(-1)
[39]397
398        # Reset everything (not really needed).
[43]399        self.allow_simple_key = False
[39]400        self.possible_simple_keys = {}
401
[43]402        # Read the token.
[116]403        mark = self.reader.get_mark()
[43]404       
[118]405        # Add STREAM-END.
[116]406        self.tokens.append(StreamEndToken(mark, mark))
[39]407
[46]408        # The reader is ended.
[39]409        self.done = True
410
[43]411    def fetch_directive(self):
412       
413        # Set the current intendation to -1.
414        self.unwind_indent(-1)
[39]415
[43]416        # Reset simple keys.
417        self.remove_possible_simple_key()
418        self.allow_simple_key = False
[39]419
[43]420        # Scan and add DIRECTIVE.
[47]421        self.tokens.append(self.scan_directive())
[39]422
423    def fetch_document_start(self):
[44]424        self.fetch_document_indicator(DocumentStartToken)
[39]425
[43]426    def fetch_document_end(self):
[44]427        self.fetch_document_indicator(DocumentEndToken)
[43]428
429    def fetch_document_indicator(self, TokenClass):
430
[39]431        # Set the current intendation to -1.
[43]432        self.unwind_indent(-1)
[39]433
[43]434        # Reset simple keys. Note that there could not be a block collection
435        # after '---'.
436        self.remove_possible_simple_key()
437        self.allow_simple_key = False
[39]438
[43]439        # Add DOCUMENT-START or DOCUMENT-END.
[116]440        start_mark = self.reader.get_mark()
[46]441        self.reader.forward(3)
[116]442        end_mark = self.reader.get_mark()
443        self.tokens.append(TokenClass(start_mark, end_mark))
[39]444
[43]445    def fetch_flow_sequence_start(self):
[44]446        self.fetch_flow_collection_start(FlowSequenceStartToken)
[39]447
[43]448    def fetch_flow_mapping_start(self):
[44]449        self.fetch_flow_collection_start(FlowMappingStartToken)
[43]450
451    def fetch_flow_collection_start(self, TokenClass):
452
[44]453        # '[' and '{' may start a simple key.
454        self.save_possible_simple_key()
455
[43]456        # Increase the flow level.
457        self.flow_level += 1
458
459        # Simple keys are allowed after '[' and '{'.
460        self.allow_simple_key = True
461
462        # Add FLOW-SEQUENCE-START or FLOW-MAPPING-START.
[116]463        start_mark = self.reader.get_mark()
[46]464        self.reader.forward()
[116]465        end_mark = self.reader.get_mark()
466        self.tokens.append(TokenClass(start_mark, end_mark))
[39]467
[43]468    def fetch_flow_sequence_end(self):
[44]469        self.fetch_flow_collection_end(FlowSequenceEndToken)
[39]470
[43]471    def fetch_flow_mapping_end(self):
[44]472        self.fetch_flow_collection_end(FlowMappingEndToken)
[43]473
474    def fetch_flow_collection_end(self, TokenClass):
475
476        # Reset possible simple key on the current level.
477        self.remove_possible_simple_key()
478
479        # Decrease the flow level.
480        self.flow_level -= 1
481
482        # No simple keys after ']' or '}'.
483        self.allow_simple_key = False
484
485        # Add FLOW-SEQUENCE-END or FLOW-MAPPING-END.
[116]486        start_mark = self.reader.get_mark()
[46]487        self.reader.forward()
[116]488        end_mark = self.reader.get_mark()
489        self.tokens.append(TokenClass(start_mark, end_mark))
[39]490
[51]491    def fetch_flow_entry(self):
[39]492
[51]493        # Simple keys are allowed after ','.
494        self.allow_simple_key = True
495
496        # Reset possible simple key on the current level.
497        self.remove_possible_simple_key()
498
499        # Add FLOW-ENTRY.
[116]500        start_mark = self.reader.get_mark()
[51]501        self.reader.forward()
[116]502        end_mark = self.reader.get_mark()
503        self.tokens.append(FlowEntryToken(start_mark, end_mark))
[51]504
505    def fetch_block_entry(self):
506
[43]507        # Block context needs additional checks.
508        if not self.flow_level:
[39]509
[43]510            # Are we allowed to start a new entry?
511            if not self.allow_simple_key:
[47]512                raise ScannerError(None, None,
513                        "sequence entries are not allowed here",
[116]514                        self.reader.get_mark())
[39]515
[43]516            # We may need to add BLOCK-SEQUENCE-START.
[46]517            if self.add_indent(self.reader.column):
[116]518                mark = self.reader.get_mark()
[127]519                inline = (self.current_line == self.previous_line)
520                self.tokens.append(BlockSequenceStartToken(mark, mark, inline))
[39]521
[51]522        # It's an error for the block entry to occur in the flow context,
523        # but we let the parser detect this.
524        else:
525            pass
526
527        # Simple keys are allowed after '-'.
[43]528        self.allow_simple_key = True
[39]529
[43]530        # Reset possible simple key on the current level.
531        self.remove_possible_simple_key()
[39]532
[51]533        # Add BLOCK-ENTRY.
[116]534        start_mark = self.reader.get_mark()
[46]535        self.reader.forward()
[116]536        end_mark = self.reader.get_mark()
537        self.tokens.append(BlockEntryToken(start_mark, end_mark))
[39]538
[43]539    def fetch_key(self):
540       
541        # Block context needs additional checks.
542        if not self.flow_level:
[39]543
[43]544            # Are we allowed to start a key (not nessesary a simple)?
545            if not self.allow_simple_key:
[47]546                raise ScannerError(None, None,
547                        "mapping keys are not allowed here",
[116]548                        self.reader.get_mark())
[43]549
550            # We may need to add BLOCK-MAPPING-START.
[46]551            if self.add_indent(self.reader.column):
[116]552                mark = self.reader.get_mark()
[127]553                inline = (self.current_line == self.previous_line)
554                self.tokens.append(BlockMappingStartToken(mark, mark, inline))
[43]555
556        # Simple keys are allowed after '?' in the block context.
557        self.allow_simple_key = not self.flow_level
558
559        # Reset possible simple key on the current level.
560        self.remove_possible_simple_key()
561
562        # Add KEY.
[116]563        start_mark = self.reader.get_mark()
[46]564        self.reader.forward()
[116]565        end_mark = self.reader.get_mark()
566        self.tokens.append(KeyToken(start_mark, end_mark))
[39]567
    def fetch_value(self):
        # ':' -- emit VALUE, plus a deferred KEY token if a simple key was
        # pending on the current flow level.

        # Do we determine a simple key?
        if self.flow_level in self.possible_simple_keys:

            # Add KEY.
            # `token_number - tokens_taken` converts the absolute token
            # number recorded in the SimpleKey into an index into the
            # pending `self.tokens` queue.
            key = self.possible_simple_keys[self.flow_level]
            del self.possible_simple_keys[self.flow_level]
            self.tokens.insert(key.token_number-self.tokens_taken,
                    KeyToken(key.mark, key.mark))

            # If this key starts a new block mapping, we need to add
            # BLOCK-MAPPING-START. It is inserted at the same index, so it
            # ends up just before the KEY token inserted above.
            if not self.flow_level:
                if self.add_indent(key.column):
                    self.tokens.insert(key.token_number-self.tokens_taken,
                            BlockMappingStartToken(key.mark, key.mark,
                                key.inline))

            # There cannot be two simple keys one after another.
            self.allow_simple_key = False

        # It must be a part of a complex key.
        else:

            # Block context needs additional checks.
            # (Do we really need them? They will be caught by the parser
            # anyway.)
            if not self.flow_level:

                # We are allowed to start a complex value if and only if
                # we can start a simple key.
                if not self.allow_simple_key:
                    raise ScannerError(None, None,
                            "mapping values are not allowed here",
                            self.reader.get_mark())

            # Simple keys are allowed after ':' in the block context.
            self.allow_simple_key = not self.flow_level

            # Reset possible simple key on the current level.
            self.remove_possible_simple_key()

        # Add VALUE.
        start_mark = self.reader.get_mark()
        self.reader.forward()
        end_mark = self.reader.get_mark()
        self.tokens.append(ValueToken(start_mark, end_mark))
[37]616
[43]617    def fetch_alias(self):
[37]618
[43]619        # ALIAS could be a simple key.
620        self.save_possible_simple_key()
[37]621
[43]622        # No simple keys after ALIAS.
623        self.allow_simple_key = False
[37]624
[43]625        # Scan and add ALIAS.
[47]626        self.tokens.append(self.scan_anchor(AliasToken))
[37]627
[43]628    def fetch_anchor(self):
[37]629
[43]630        # ANCHOR could start a simple key.
631        self.save_possible_simple_key()
[37]632
[43]633        # No simple keys after ANCHOR.
634        self.allow_simple_key = False
[37]635
[43]636        # Scan and add ANCHOR.
[47]637        self.tokens.append(self.scan_anchor(AnchorToken))
[37]638
[43]639    def fetch_tag(self):
[37]640
[43]641        # TAG could start a simple key.
642        self.save_possible_simple_key()
[37]643
[43]644        # No simple keys after TAG.
645        self.allow_simple_key = False
[37]646
[43]647        # Scan and add TAG.
[47]648        self.tokens.append(self.scan_tag())
[37]649
[43]650    def fetch_literal(self):
651        self.fetch_block_scalar(folded=False)
[37]652
[43]653    def fetch_folded(self):
654        self.fetch_block_scalar(folded=True)
[37]655
[43]656    def fetch_block_scalar(self, folded):
[37]657
[43]658        # A simple key may follow a block scalar.
659        self.allow_simple_key = True
[37]660
[43]661        # Reset possible simple key on the current level.
662        self.remove_possible_simple_key()
[37]663
[43]664        # Scan and add SCALAR.
[47]665        self.tokens.append(self.scan_block_scalar(folded))
[37]666
[43]667    def fetch_single(self):
668        self.fetch_flow_scalar(double=False)
[37]669
[43]670    def fetch_double(self):
671        self.fetch_flow_scalar(double=True)
[37]672
[43]673    def fetch_flow_scalar(self, double):
[37]674
[43]675        # A flow scalar could be a simple key.
676        self.save_possible_simple_key()
[37]677
[43]678        # No simple keys after flow scalars.
679        self.allow_simple_key = False
[37]680
[43]681        # Scan and add SCALAR.
[47]682        self.tokens.append(self.scan_flow_scalar(double))
[37]683
[43]684    def fetch_plain(self):
[37]685
[43]686        # A plain scalar could be a simple key.
687        self.save_possible_simple_key()
[37]688
[43]689        # No simple keys after plain scalars. But note that `scan_plain` will
690        # change this flag if the scan is finished at the beginning of the
691        # line.
692        self.allow_simple_key = False
[37]693
[43]694        # Scan and add SCALAR. May change `allow_simple_key`.
[47]695        self.tokens.append(self.scan_plain())
[37]696
[43]697    # Checkers.
[37]698
[43]699    def check_directive(self):
[37]700
[43]701        # DIRECTIVE:        ^ '%' ...
702        # The '%' indicator is already checked.
[46]703        if self.reader.column == 0:
[43]704            return True
[37]705
[43]706    def check_document_start(self):
[37]707
[43]708        # DOCUMENT-START:   ^ '---' (' '|'\n')
[46]709        if self.reader.column == 0:
[48]710            if self.reader.prefix(3) == u'---'  \
711                    and self.reader.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
[43]712                return True
[37]713
[43]714    def check_document_end(self):
[37]715
[43]716        # DOCUMENT-END:     ^ '...' (' '|'\n')
[46]717        if self.reader.column == 0:
718            prefix = self.reader.peek(4)
[48]719            if self.reader.prefix(3) == u'...'  \
720                    and self.reader.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
[43]721                return True
[37]722
[51]723    def check_block_entry(self):
[43]724
[51]725        # BLOCK-ENTRY:      '-' (' '|'\n')
726        return self.reader.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'
[43]727
728    def check_key(self):
729
730        # KEY(flow context):    '?'
731        if self.flow_level:
[37]732            return True
[43]733
734        # KEY(block context):   '?' (' '|'\n')
[37]735        else:
[48]736            return self.reader.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'
[37]737
[43]738    def check_value(self):
739
740        # VALUE(flow context):  ':'
741        if self.flow_level:
[37]742            return True
[43]743
744        # VALUE(block context): ':' (' '|'\n')
[37]745        else:
[48]746            return self.reader.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'
[37]747
[43]748    def check_plain(self):
[37]749
[48]750        # A plain scalar may start with any non-space character except:
751        #   '-', '?', ':', ',', '[', ']', '{', '}',
752        #   '#', '&', '*', '!', '|', '>', '\'', '\"',
753        #   '%', '@', '`'.
754        #
755        # It may also start with
756        #   '-', '?', ':'
757        # if it is followed by a non-space character.
758        #
759        # Note that we limit the last rule to the block context (except the
760        # '-' character) because we want the flow context to be space
761        # independent.
762        ch = self.reader.peek()
763        return ch not in u'\0 \t\r\n\x85\u2028\u2029-?:,[]{}#&*!|>\'\"%@`'  \
764                or (self.reader.peek(1) not in u'\0 \t\r\n\x85\u2028\u2029'
765                        and (ch == '-' or (not self.flow_level and ch in u'?:')))
766
[43]767    # Scanners.
768
    def scan_to_next_token(self):
        # Skip spaces, line breaks and comments until the next token starts.
        # If we find a line break in the block context, we set the flag
        # `allow_simple_key` on.
        # The byte order mark is stripped if it's the first character in the
        # stream. We do not yet support BOM inside the stream as the
        # specification requires. Any such mark will be considered as a part
        # of the document.
        #
        # TODO: We need to make tab handling rules more sane. A good rule is
        #   Tabs cannot precede tokens
        #   BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END,
        #   KEY(block), VALUE(block), BLOCK-ENTRY
        # So the checking code is
        #   if <TAB>:
        #       self.allow_simple_keys = False
        # We also need to add the check for `allow_simple_keys == True` to
        # `unwind_indent` before issuing BLOCK-END.
        # Scanners for block, flow, and plain scalars need to be modified.

        # Strip a leading BOM only at the very start of the stream.
        if self.reader.index == 0 and self.reader.peek() == u'\uFEFF':
            self.reader.forward()
        found = False
        while not found:
            while self.reader.peek() == u' ':
                self.reader.forward()
            # A comment runs to the end of the line.
            if self.reader.peek() == u'#':
                while self.reader.peek() not in u'\0\r\n\x85\u2028\u2029':
                    self.reader.forward()
            if self.scan_line_break():
                # A line break in block context re-enables simple keys.
                if not self.flow_level:
                    self.allow_simple_key = True
            else:
                found = True

        # Bookkeeping for the style information passed to the parser:
        # remember where the previous token ended and where the next one
        # starts.
        self.previous_line = self.current_line
        self.previous_column = self.current_column
        self.current_line = self.reader.line
        self.current_column = self.reader.column
808
    def scan_directive(self):
        # See the specification for details.
        # Scans '%NAME value...' and returns a DirectiveToken. The value is
        # parsed only for the known YAML and TAG directives; for unknown
        # directives the rest of the line is skipped and the value is None.
        start_mark = self.reader.get_mark()
        self.reader.forward()
        name = self.scan_directive_name(start_mark)
        value = None
        if name == u'YAML':
            value = self.scan_yaml_directive_value(start_mark)
            end_mark = self.reader.get_mark()
        elif name == u'TAG':
            value = self.scan_tag_directive_value(start_mark)
            end_mark = self.reader.get_mark()
        else:
            # Unknown directive: the token ends right after its name.
            end_mark = self.reader.get_mark()
            while self.reader.peek() not in u'\0\r\n\x85\u2028\u2029':
                self.reader.forward()
        self.scan_directive_ignored_line(start_mark)
        return DirectiveToken(name, value, start_mark, end_mark)
[48]827
[116]828    def scan_directive_name(self, start_mark):
[48]829        # See the specification for details.
830        length = 0
831        ch = self.reader.peek(length)
832        while u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z'  \
833                or ch in u'-_':
834            length += 1
835            ch = self.reader.peek(length)
836        if not length:
[116]837            raise ScannerError("while scanning a directive", start_mark,
[52]838                    "expected alphabetic or numeric character, but found %r"
[116]839                    % ch.encode('utf-8'), self.reader.get_mark())
[48]840        value = self.reader.prefix(length)
841        self.reader.forward(length)
842        ch = self.reader.peek()
843        if ch not in u'\0 \r\n\x85\u2028\u2029':
[116]844            raise ScannerError("while scanning a directive", start_mark,
[48]845                    "expected alphabetic or numeric character, but found %r"
[116]846                    % ch.encode('utf-8'), self.reader.get_mark())
[48]847        return value
848
[116]849    def scan_yaml_directive_value(self, start_mark):
[48]850        # See the specification for details.
851        while self.reader.peek() == u' ':
[46]852            self.reader.forward()
[116]853        major = self.scan_yaml_directive_number(start_mark)
[48]854        if self.reader.peek() != '.':
[116]855            raise ScannerError("while scanning a directive", start_mark,
[52]856                    "expected a digit or '.', but found %r"
857                    % self.reader.peek().encode('utf-8'),
[116]858                    self.reader.get_mark())
[46]859        self.reader.forward()
[116]860        minor = self.scan_yaml_directive_number(start_mark)
[48]861        if self.reader.peek() not in u'\0 \r\n\x85\u2028\u2029':
[116]862            raise ScannerError("while scanning a directive", start_mark,
[52]863                    "expected a digit or ' ', but found %r"
864                    % self.reader.peek().encode('utf-8'),
[116]865                    self.reader.get_mark())
[48]866        return (major, minor)
[37]867
[116]868    def scan_yaml_directive_number(self, start_mark):
[48]869        # See the specification for details.
870        ch = self.reader.peek()
871        if not (u'0' <= ch <= '9'):
[116]872            raise ScannerError("while scanning a directive", start_mark,
[48]873                    "expected a digit, but found %r" % ch.encode('utf-8'),
[116]874                    self.reader.get_mark())
[48]875        length = 0
876        while u'0' <= self.reader.peek(length) <= u'9':
877            length += 1
878        value = int(self.reader.prefix(length))
879        self.reader.forward(length)
880        return value
881
[116]882    def scan_tag_directive_value(self, start_mark):
[48]883        # See the specification for details.
884        while self.reader.peek() == u' ':
885            self.reader.forward()
[116]886        handle = self.scan_tag_directive_handle(start_mark)
[48]887        while self.reader.peek() == u' ':
888            self.reader.forward()
[116]889        prefix = self.scan_tag_directive_prefix(start_mark)
[48]890        return (handle, prefix)
891
[116]892    def scan_tag_directive_handle(self, start_mark):
[48]893        # See the specification for details.
[116]894        value = self.scan_tag_handle('directive', start_mark)
[52]895        ch = self.reader.peek()
896        if ch != u' ':
[116]897            raise ScannerError("while scanning a directive", start_mark,
[48]898                    "expected ' ', but found %r" % ch.encode('utf-8'),
[116]899                    self.reader.get_mark())
[48]900        return value
901
[116]902    def scan_tag_directive_prefix(self, start_mark):
[48]903        # See the specification for details.
[116]904        value = self.scan_tag_uri('directive', start_mark)
[48]905        ch = self.reader.peek()
906        if ch not in u'\0 \r\n\x85\u2028\u2029':
[116]907            raise ScannerError("while scanning a directive", start_mark,
[48]908                    "expected ' ', but found %r" % ch.encode('utf-8'),
[116]909                    self.reader.get_mark())
[48]910        return value
911
[116]912    def scan_directive_ignored_line(self, start_mark):
[48]913        # See the specification for details.
914        while self.reader.peek() == u' ':
915            self.reader.forward()
916        if self.reader.peek() == u'#':
917            while self.reader.peek() not in u'\0\r\n\x85\u2028\u2029':
918                self.reader.forward()
919        ch = self.reader.peek()
920        if ch not in u'\0\r\n\x85\u2028\u2029':
[116]921            raise ScannerError("while scanning a directive", start_mark,
[48]922                    "expected a comment or a line break, but found %r"
[116]923                        % ch.encode('utf-8'), self.reader.get_mark())
[48]924        self.scan_line_break()
925
[43]926    def scan_anchor(self, TokenClass):
[48]927        # The specification does not restrict characters for anchors and
928        # aliases. This may lead to problems, for instance, the document:
929        #   [ *alias, value ]
930        # can be interpteted in two ways, as
931        #   [ "value" ]
932        # and
933        #   [ *alias , "value" ]
934        # Therefore we restrict aliases to numbers and ASCII letters.
[116]935        start_mark = self.reader.get_mark()
[48]936        indicator = self.reader.peek()
937        if indicator == '*':
938            name = 'alias'
939        else:
940            name = 'anchor'
941        self.reader.forward()
942        length = 0
943        ch = self.reader.peek(length)
944        while u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z'  \
945                or ch in u'-_':
946            length += 1
947            ch = self.reader.peek(length)
948        if not length:
[116]949            raise ScannerError("while scanning an %s" % name, start_mark,
[52]950                    "expected alphabetic or numeric character, but found %r"
[116]951                    % ch.encode('utf-8'), self.reader.get_mark())
[48]952        value = self.reader.prefix(length)
953        self.reader.forward(length)
954        ch = self.reader.peek()
955        if ch not in u'\0 \t\r\n\x85\u2028\u2029?:,]}%@`':
[116]956            raise ScannerError("while scanning an %s" % name, start_mark,
[48]957                    "expected alphabetic or numeric character, but found %r"
[116]958                    % ch.encode('utf-8'), self.reader.get_mark())
959        end_mark = self.reader.get_mark()
960        return TokenClass(value, start_mark, end_mark)
[37]961
[43]962    def scan_tag(self):
[48]963        # See the specification for details.
[116]964        start_mark = self.reader.get_mark()
[48]965        ch = self.reader.peek(1)
966        if ch == u'<':
967            handle = None
968            self.reader.forward(2)
[116]969            suffix = self.scan_tag_uri('tag', start_mark)
[48]970            if self.reader.peek() != u'>':
[116]971                raise ScannerError("while parsing a tag", start_mark,
[52]972                        "expected '>', but found %r" % self.reader.peek().encode('utf-8'),
[116]973                        self.reader.get_mark())
[46]974            self.reader.forward()
[48]975        elif ch in u'\0 \t\r\n\x85\u2028\u2029':
976            handle = None
977            suffix = u'!'
978            self.reader.forward()
979        else:
980            length = 1
981            use_handle = False
982            while ch not in u'\0 \r\n\x85\u2028\u2029':
983                if ch == u'!':
984                    use_handle = True
985                    break
986                length += 1
987                ch = self.reader.peek(length)
988            handle = u'!'
989            if use_handle:
[116]990                handle = self.scan_tag_handle('tag', start_mark)
[48]991            else:
992                handle = u'!'
993                self.reader.forward()
[116]994            suffix = self.scan_tag_uri('tag', start_mark)
[48]995        ch = self.reader.peek()
996        if ch not in u'\0 \r\n\x85\u2028\u2029':
[116]997            raise ScannerError("while scanning a tag", start_mark,
[48]998                    "expected ' ', but found %r" % ch.encode('utf-8'),
[116]999                    self.reader.get_mark())
[48]1000        value = (handle, suffix)
[116]1001        end_mark = self.reader.get_mark()
1002        return TagToken(value, start_mark, end_mark)
[43]1003
    def scan_block_scalar(self, folded):
        # Scan a literal ('|', folded=False) or folded ('>', folded=True)
        # block scalar and return a ScalarToken.
        # See the specification for details.

        chunks = []
        start_mark = self.reader.get_mark()

        # Scan the header: chomping/indentation indicators plus an
        # optional comment.
        self.reader.forward()
        chomping, increment = self.scan_block_scalar_indicators(start_mark)
        self.scan_block_scalar_ignored_line(start_mark)

        # Determine the indentation level and go to the first non-empty line.
        min_indent = self.indent+1
        if min_indent < 1:
            min_indent = 1
        if increment is None:
            # No explicit indentation indicator: auto-detect it from the
            # first non-empty line.
            breaks, max_indent, end_mark = self.scan_block_scalar_indentation()
            indent = max(min_indent, max_indent)
        else:
            # Explicit indicator is relative to the parent indentation.
            indent = min_indent+increment-1
            breaks, end_mark = self.scan_block_scalar_breaks(indent)
        line_break = u''

        # Scan the inner part of the block scalar.
        while self.reader.column == indent and self.reader.peek() != u'\0':
            chunks.extend(breaks)
            # Lines starting with a space or tab are "more indented" and
            # must not be folded with their neighbors.
            leading_non_space = self.reader.peek() not in u' \t'
            length = 0
            while self.reader.peek(length) not in u'\0\r\n\x85\u2028\u2029':
                length += 1
            chunks.append(self.reader.prefix(length))
            self.reader.forward(length)
            line_break = self.scan_line_break()
            breaks, end_mark = self.scan_block_scalar_breaks(indent)
            if self.reader.column == indent and self.reader.peek() != u'\0':
                # Unfortunately, folding rules are ambiguous.
                #
                # This is the folding according to the specification:

                if folded and line_break == u'\n'   \
                        and leading_non_space and self.reader.peek() not in u' \t':
                    if not breaks:
                        chunks.append(u' ')
                else:
                    chunks.append(line_break)

                # This is Clark Evans's interpretation (also in the spec
                # examples):
                #
                #if folded and line_break == u'\n':
                #    if not breaks:
                #        if self.reader.peek() not in ' \t':
                #            chunks.append(u' ')
                #        else:
                #            chunks.append(line_break)
                #else:
                #    chunks.append(line_break)
            else:
                break

        # Chomp the tail:
        #   None  (clip)  - keep the final line break, drop trailing breaks;
        #   True  ('+')   - keep the final break and all trailing breaks;
        #   False ('-')   - strip both.
        if chomping is not False:
            chunks.append(line_break)
        if chomping is True:
            chunks.extend(breaks)

        # We are done.
        if folded:
            style = '>'
        else:
            style = '|'
        return ScalarToken(u''.join(chunks), False, start_mark, end_mark,
                style)
[48]1077
[116]1078    def scan_block_scalar_indicators(self, start_mark):
[48]1079        # See the specification for details.
1080        chomping = None
1081        increment = None
1082        ch = self.reader.peek()
1083        if ch in u'+-':
1084            if ch == '+':
1085                chomping = True
1086            else:
1087                chomping = False
1088            self.reader.forward()
1089            ch = self.reader.peek()
1090            if ch in u'0123456789':
1091                increment = int(ch)
1092                if increment == 0:
[116]1093                    raise ScannerError("while scanning a block scalar", start_mark,
[48]1094                            "expected indentation indicator in the range 1-9, but found 0",
[116]1095                            self.reader.get_mark())
[46]1096                self.reader.forward()
[48]1097        elif ch in u'0123456789':
1098            increment = int(ch)
1099            if increment == 0:
[116]1100                raise ScannerError("while scanning a block scalar", start_mark,
[48]1101                        "expected indentation indicator in the range 1-9, but found 0",
[116]1102                        self.reader.get_mark())
[48]1103            self.reader.forward()
1104            ch = self.reader.peek()
1105            if ch in u'+-':
1106                if ch == '+':
1107                    chomping = True
1108                else:
1109                    chomping = False
[46]1110                self.reader.forward()
[48]1111        ch = self.reader.peek()
1112        if ch not in u'\0 \r\n\x85\u2028\u2029':
[116]1113            raise ScannerError("while scanning a block scalar", start_mark,
[48]1114                    "expected chomping or indentation indicators, but found %r"
[116]1115                        % ch.encode('utf-8'), self.reader.get_mark())
[48]1116        return chomping, increment
1117
[116]1118    def scan_block_scalar_ignored_line(self, start_mark):
[48]1119        # See the specification for details.
1120        while self.reader.peek() == u' ':
1121            self.reader.forward()
1122        if self.reader.peek() == u'#':
1123            while self.reader.peek() not in u'\0\r\n\x85\u2028\u2029':
[46]1124                self.reader.forward()
[48]1125        ch = self.reader.peek()
1126        if ch not in u'\0\r\n\x85\u2028\u2029':
[116]1127            raise ScannerError("while scanning a block scalar", start_mark,
[48]1128                    "expected a comment or a line break, but found %r"
[116]1129                        % ch.encode('utf-8'), self.reader.get_mark())
[48]1130        self.scan_line_break()
[43]1131
[48]1132    def scan_block_scalar_indentation(self):
1133        # See the specification for details.
1134        chunks = []
1135        max_indent = 0
[116]1136        end_mark = self.reader.get_mark()
[48]1137        while self.reader.peek() in u' \r\n\x85\u2028\u2029':
1138            if self.reader.peek() != u' ':
1139                chunks.append(self.scan_line_break())
[116]1140                end_mark = self.reader.get_mark()
[48]1141            else:
1142                self.reader.forward()
1143                if self.reader.column > max_indent:
1144                    max_indent = self.reader.column
[116]1145        return chunks, max_indent, end_mark
[48]1146
1147    def scan_block_scalar_breaks(self, indent):
1148        # See the specification for details.
1149        chunks = []
[116]1150        end_mark = self.reader.get_mark()
[48]1151        while self.reader.column < indent and self.reader.peek() == u' ':
1152            self.reader.forward()
1153        while self.reader.peek() in u'\r\n\x85\u2028\u2029':
1154            chunks.append(self.scan_line_break())
[116]1155            end_mark = self.reader.get_mark()
[48]1156            while self.reader.column < indent and self.reader.peek() == u' ':
1157                self.reader.forward()
[116]1158        return chunks, end_mark
[48]1159
[43]1160    def scan_flow_scalar(self, double):
[48]1161        # See the specification for details.
[117]1162        # Note that we loose indentation rules for quoted scalars. Quoted
1163        # scalars don't need to adhere indentation because " and ' clearly
1164        # mark the beginning and the end of them. Therefore we are less
1165        # restrictive then the specification requires. We only need to check
1166        # that document separators are not included in scalars.
[48]1167        chunks = []
[116]1168        start_mark = self.reader.get_mark()
[46]1169        quote = self.reader.peek()
1170        self.reader.forward()
[117]1171        chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
[46]1172        while self.reader.peek() != quote:
[117]1173            chunks.extend(self.scan_flow_scalar_spaces(double, start_mark))
1174            chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
[48]1175        self.reader.forward()
[116]1176        end_mark = self.reader.get_mark()
[127]1177        if double:
1178            style = '"'
1179        else:
1180            style = '\''
1181        return ScalarToken(u''.join(chunks), False, start_mark, end_mark,
1182                style)
[48]1183
    # Single-character escapes recognized in double-quoted scalars,
    # mapped to the characters they produce.
    ESCAPE_REPLACEMENTS = {
        u'0':   u'\0',
        u'a':   u'\x07',
        u'b':   u'\x08',
        u't':   u'\x09',
        u'\t':  u'\x09',
        u'n':   u'\x0A',
        u'v':   u'\x0B',
        u'f':   u'\x0C',
        u'r':   u'\x0D',
        u'e':   u'\x1B',
        u' ':   u'\x20',
        u'\"':  u'\"',
        u'\\':  u'\\',
        u'N':   u'\x85',
        u'_':   u'\xA0',
        u'L':   u'\u2028',
        u'P':   u'\u2029',
    }

    # Numeric escape indicators mapped to the number of hexadecimal
    # digits that follow them: \xXX, \uXXXX, \UXXXXXXXX.
    ESCAPE_CODES = {
        u'x':   2,
        u'u':   4,
        u'U':   8,
    }
1209
    def scan_flow_scalar_non_spaces(self, double, start_mark):
        # Scan a run of non-blank content inside a quoted scalar,
        # resolving quote doubling ('' in single-quoted style) and
        # backslash escapes (double-quoted style only).
        # See the specification for details.
        chunks = []
        while True:
            # Copy everything up to the next quote, backslash or blank.
            length = 0
            while self.reader.peek(length) not in u'\'\"\\\0 \t\r\n\x85\u2028\u2029':
                length += 1
            if length:
                chunks.append(self.reader.prefix(length))
                self.reader.forward(length)
            ch = self.reader.peek()
            if not double and ch == u'\'' and self.reader.peek(1) == u'\'':
                # '' in a single-quoted scalar is an escaped quote.
                chunks.append(u'\'')
                self.reader.forward(2)
            elif (double and ch == u'\'') or (not double and ch in u'\"\\'):
                # These characters are literal in the other quoting style.
                chunks.append(ch)
                self.reader.forward()
            elif double and ch == u'\\':
                self.reader.forward()
                ch = self.reader.peek()
                if ch in self.ESCAPE_REPLACEMENTS:
                    # Single-character escape, e.g. \n or \t.
                    chunks.append(self.ESCAPE_REPLACEMENTS[ch])
                    self.reader.forward()
                elif ch in self.ESCAPE_CODES:
                    # Numeric escape: \xXX, \uXXXX or \UXXXXXXXX.
                    length = self.ESCAPE_CODES[ch]
                    self.reader.forward()
                    for k in range(length):
                        if self.reader.peek(k) not in u'0123456789ABCDEFabcdef':
                            raise ScannerError("while scanning a double-quoted scalar", start_mark,
                                    "expected escape sequence of %d hexdecimal numbers, but found %r" %
                                        (length, self.reader.peek(k).encode('utf-8')), self.reader.get_mark())
                    code = int(self.reader.prefix(length), 16)
                    chunks.append(unichr(code))
                    self.reader.forward(length)
                elif ch in u'\r\n\x85\u2028\u2029':
                    # An escaped line break is removed along with the
                    # following indentation.
                    self.scan_line_break()
                    chunks.extend(self.scan_flow_scalar_breaks(double, start_mark))
                else:
                    raise ScannerError("while scanning a double-quoted scalar", start_mark,
                            "found unknown escape character %r" % ch.encode('utf-8'), self.reader.get_mark())
            else:
                # A blank, a line break or the closing quote: let the
                # caller handle it.
                return chunks
[37]1252
[117]1253    def scan_flow_scalar_spaces(self, double, start_mark):
[48]1254        # See the specification for details.
1255        chunks = []
1256        length = 0
1257        while self.reader.peek(length) in u' \t':
1258            length += 1
1259        whitespaces = self.reader.prefix(length)
1260        self.reader.forward(length)
1261        ch = self.reader.peek()
1262        if ch == u'\0':
[116]1263            raise ScannerError("while scanning a quoted scalar", start_mark,
1264                    "found unexpected end of stream", self.reader.get_mark())
[48]1265        elif ch in u'\r\n\x85\u2028\u2029':
1266            line_break = self.scan_line_break()
[117]1267            breaks = self.scan_flow_scalar_breaks(double, start_mark)
[48]1268            if line_break != u'\n':
1269                chunks.append(line_break)
1270            elif not breaks:
1271                chunks.append(u' ')
1272            chunks.extend(breaks)
1273        else:
1274            chunks.append(whitespaces)
1275        return chunks
1276
[117]1277    def scan_flow_scalar_breaks(self, double, start_mark):
[48]1278        # See the specification for details.
1279        chunks = []
1280        while True:
[117]1281            # Instead of checking indentation, we check for document
1282            # separators.
1283            prefix = self.reader.prefix(3)
1284            if (prefix == u'---' or prefix == u'...')   \
1285                    and self.reader.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
[116]1286                raise ScannerError("while scanning a quoted scalar", start_mark,
[117]1287                        "found unexpected document separator", self.reader.get_mark())
[48]1288            while self.reader.peek() in u' \t':
1289                self.reader.forward()
1290            if self.reader.peek() in u'\r\n\x85\u2028\u2029':
1291                chunks.append(self.scan_line_break())
1292            else:
1293                return chunks
1294
[43]1295    def scan_plain(self):
[48]1296        # See the specification for details.
1297        # We add an additional restriction for the flow context:
[117]1298        #   plain scalars in the flow context cannot contain ',', ':' and '?'.
[48]1299        # We also keep track of the `allow_simple_key` flag here.
[117]1300        # Indentation rules are loosed for the flow context.
[48]1301        chunks = []
[116]1302        start_mark = self.reader.get_mark()
1303        end_mark = start_mark
[43]1304        indent = self.indent+1
[117]1305        # We allow zero indentation for scalars, but then we need to check for
1306        # document separators at the beginning of the line.
1307        #if indent == 0:
1308        #    indent = 1
[48]1309        spaces = []
[43]1310        while True:
[48]1311            length = 0
1312            if self.reader.peek() == u'#':
[43]1313                break
[48]1314            while True:
1315                ch = self.reader.peek(length)
1316                if ch in u'\0 \t\r\n\x85\u2028\u2029'   \
1317                        or (not self.flow_level and ch == u':' and
1318                                self.reader.peek(length+1) in u'\0 \t\r\n\x28\u2028\u2029') \
1319                        or (self.flow_level and ch in u',:?[]{}'):
1320                    break
1321                length += 1
1322            if length == 0:
[43]1323                break
[48]1324            self.allow_simple_key = False
1325            chunks.extend(spaces)
1326            chunks.append(self.reader.prefix(length))
1327            self.reader.forward(length)
[116]1328            end_mark = self.reader.get_mark()
[117]1329            spaces = self.scan_plain_spaces(indent, start_mark)
[48]1330            if not spaces or self.reader.peek() == u'#' \
[117]1331                    or (not self.flow_level and self.reader.column < indent):
[48]1332                break
[127]1333        return ScalarToken(u''.join(chunks), True, start_mark, end_mark, '')
[37]1334
    def scan_plain_spaces(self, indent, start_mark):
        # Scan the blanks and line breaks between two chunks of a plain
        # scalar.  Returns the folded whitespace chunks; returns None
        # (bare `return`) when a document separator '---'/'...' is found,
        # which the caller treats the same as an empty list, ending the
        # scalar.
        # The specification is really confusing about tabs in plain scalars.
        # We just forbid them completely. Do not use tabs in YAML!
        chunks = []
        length = 0
        while self.reader.peek(length) in u' ':
            length += 1
        whitespaces = self.reader.prefix(length)
        self.reader.forward(length)
        ch = self.reader.peek()
        if ch in u'\r\n\x85\u2028\u2029':
            line_break = self.scan_line_break()
            # After a line break the next token may start a simple key.
            self.allow_simple_key = True
            prefix = self.reader.prefix(3)
            if (prefix == u'---' or prefix == u'...')   \
                    and self.reader.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
                return
            breaks = []
            while self.reader.peek() in u' \r\n\x85\u2028\u2029':
                if self.reader.peek() == ' ':
                    self.reader.forward()
                else:
                    breaks.append(self.scan_line_break())
                    # Re-check for a document separator after every break.
                    prefix = self.reader.prefix(3)
                    if (prefix == u'---' or prefix == u'...')   \
                            and self.reader.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
                        return
            # Fold: a single '\n' becomes a space; extra breaks are kept.
            if line_break != u'\n':
                chunks.append(line_break)
            elif not breaks:
                chunks.append(u' ')
            chunks.extend(breaks)
        elif whitespaces:
            chunks.append(whitespaces)
        return chunks
1371
[116]1372    def scan_tag_handle(self, name, start_mark):
[48]1373        # See the specification for details.
1374        # For some strange reasons, the specification does not allow '_' in
1375        # tag handles. I have allowed it anyway.
[52]1376        ch = self.reader.peek()
1377        if ch != u'!':
[116]1378            raise ScannerError("while scanning a %s" % name, start_mark,
[48]1379                    "expected '!', but found %r" % ch.encode('utf-8'),
[116]1380                    self.reader.get_mark())
[48]1381        length = 1
1382        ch = self.reader.peek(length)
1383        if ch != u' ':
1384            while u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z'  \
1385                    or ch in u'-_':
1386                length += 1
1387                ch = self.reader.peek(length)
1388            if ch != u'!':
1389                self.reader.forward(length)
[116]1390                raise ScannerError("while scanning a %s" % name, start_mark,
[48]1391                        "expected '!', but found %r" % ch.encode('utf-8'),
[116]1392                        self.reader.get_mark())
[48]1393            length += 1
1394        value = self.reader.prefix(length)
1395        self.reader.forward(length)
1396        return value
1397
[116]1398    def scan_tag_uri(self, name, start_mark):
[48]1399        # See the specification for details.
1400        # Note: we do not check if URI is well-formed.
1401        chunks = []
1402        length = 0
1403        ch = self.reader.peek(length)
1404        while u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z'  \
1405                or ch in u'-;/?:@&=+$,_.!~*\'()[]%':
1406            if ch == u'%':
1407                chunks.append(self.reader.prefix(length))
1408                self.reader.forward(length)
1409                length = 0
[116]1410                chunks.append(self.scan_uri_escapes(name, start_mark))
[48]1411            else:
1412                length += 1
1413            ch = self.reader.peek(length)
1414        if length:
1415            chunks.append(self.reader.prefix(length))
1416            self.reader.forward(length)
1417            length = 0
1418        if not chunks:
[116]1419            raise ScannerError("while parsing a %s" % name, start_mark,
[48]1420                    "expected URI, but found %r" % ch.encode('utf-8'),
[116]1421                    self.reader.get_mark())
[48]1422        return u''.join(chunks)
1423
    def scan_uri_escapes(self, name, start_mark):
        # Decode a run of '%XX' escapes within a tag URI; the collected
        # raw bytes are then decoded as a single UTF-8 sequence.
        # (Python 2 code: `unicode` and the old `except X, e` syntax.)
        bytes = []
        # `mark` points at the first '%' of the escape run and is used
        # in the error report if the bytes are not valid UTF-8.
        mark = self.reader.get_mark()
        while self.reader.peek() == u'%':
            self.reader.forward()
            for k in range(2):
                if self.reader.peek(k) not in u'0123456789ABCDEFabcdef':
                    raise ScannerError("while scanning a %s" % name, start_mark,
                            "expected URI escape sequence of 2 hexdecimal numbers, but found %r" %
                                (self.reader.peek(k).encode('utf-8')), self.reader.get_mark())
            bytes.append(chr(int(self.reader.prefix(2), 16)))
            self.reader.forward(2)
        try:
            value = unicode(''.join(bytes), 'utf-8')
        except UnicodeDecodeError, exc:
            raise ScannerError("while scanning a %s" % name, start_mark, str(exc), mark)
        return value
1442
[47]1443    def scan_line_break(self):
1444        # Transforms:
1445        #   '\r\n'      :   '\n'
1446        #   '\r'        :   '\n'
1447        #   '\n'        :   '\n'
1448        #   '\x85'      :   '\n'
1449        #   '\u2028'    :   '\u2028'
1450        #   '\u2029     :   '\u2029'
1451        #   default     :   ''
1452        ch = self.reader.peek()
1453        if ch in u'\r\n\x85':
[48]1454            if self.reader.prefix(2) == u'\r\n':
[60]1455                self.reader.forward(2)
[47]1456            else:
1457                self.reader.forward()
1458            return u'\n'
1459        elif ch in u'\u2028\u2029':
1460            self.reader.forward()
1461            return ch
1462        return u''
1463
[45]1464#try:
1465#    import psyco
1466#    psyco.bind(Scanner)
1467#except ImportError:
1468#    pass
1469
Note: See TracBrowser for help on using the repository browser.