source: branches/pyyaml3000/lib/yaml/scanner.py @ 51

Revision 51, 51.6 KB, checked in by xi, 9 years ago

Parser is done. Add iterator interfaces for Scanner and Parser.


# Tokens:
# YAML-DIRECTIVE(major_version, minor_version), TAG-DIRECTIVE(handle, prefix)
# RESERVED-DIRECTIVE(name)
# DOCUMENT-START, DOCUMENT-END, STREAM-END
# BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END
# FLOW-SEQUENCE-START, FLOW-MAPPING-START, FLOW-SEQUENCE-END, FLOW-MAPPING-END
# BLOCK-ENTRY, FLOW-ENTRY, KEY, VALUE
# ALIAS(name), ANCHOR(name), TAG(value), SCALAR(value, plain)
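#
# For example (an illustrative sketch, not verbatim output), the document
#   key: [value]
# is scanned into roughly the following token stream:
#   BLOCK-MAPPING-START, KEY, SCALAR('key', plain), VALUE,
#   FLOW-SEQUENCE-START, SCALAR('value', plain), FLOW-SEQUENCE-END,
#   BLOCK-END, STREAM-END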

__all__ = ['Scanner', 'ScannerError']

from error import YAMLError
from tokens import *

class ScannerError(YAMLError):
    # ScannerError: while reading a quoted string
    #         in '...', line 5, column 10:
    # key: "valu\?e"
    #      ^
    # got unknown quote character '?'
    #         in '...', line 5, column 15:
    # key: "valu\?e"
    #            ^

    def __init__(self, context=None, context_marker=None,
            problem=None, problem_marker=None):
        self.context = context
        self.context_marker = context_marker
        self.problem = problem
        self.problem_marker = problem_marker

    def __str__(self):
        lines = []
        for (place, marker) in [(self.context, self.context_marker),
                                (self.problem, self.problem_marker)]:
            if place is not None:
                lines.append(place)
                if marker is not None:
                    lines.append(str(marker))
        return '\n'.join(lines)

class SimpleKey:
    # See the simple keys treatment below.

    def __init__(self, token_number, required, index, line, column, marker):
        self.token_number = token_number
        self.required = required
        self.index = index
        self.line = line
        self.column = column
        self.marker = marker

class Scanner:

    def __init__(self, reader):
        """Initialize the scanner."""
        # The input stream. The Reader class does the dirty work of checking
        # for BOM and converting the input data to Unicode. It also adds NUL
        # to the end.
        #
        # Reader supports the following methods
        #   self.reader.peek(i=0)       # peek the next i-th character
        #   self.reader.prefix(l=1)     # peek the next l characters
        #   self.reader.forward(l=1)    # read the next l characters
        #                               # and move the pointer
        self.reader = reader

        # Have we reached the end of the stream?
        self.done = False

        # The number of unclosed '{' and '['. `flow_level == 0` means block
        # context.
        self.flow_level = 0

        # List of processed tokens that are not yet emitted.
        self.tokens = []

        # Number of tokens that were emitted through the `get` method or the
        # iterator.
        self.tokens_taken = 0

        # The current indentation level.
        self.indent = -1

        # Past indentation levels.
        self.indents = []

        # Variables related to simple keys treatment.

        # A simple key is a key that is not denoted by the '?' indicator.
        # Examples of simple keys:
        #   ---
        #   block simple key: value
        #   ? not a simple key:
        #   : { flow simple key: value }
        # We emit the KEY token before all keys, so when we find a potential
        # simple key, we try to locate the corresponding ':' indicator.
        # Simple keys should be limited to a single line and 1024 characters.

        # Can a simple key start at the current position? A simple key may
        # start:
        # - at the beginning of the line, not counting indentation spaces
        #       (in block context),
        # - after '{', '[', ',' (in the flow context),
        # - after '?', ':', '-' (in the block context).
        # In the block context, this flag also signifies whether a block
        # collection may start at the current position.
        self.allow_simple_key = True

        # Keep track of possible simple keys. This is a dictionary. The key
        # is `flow_level`; there can be no more than one possible simple key
        # for each level. The value is a SimpleKey record:
        #   (token_number, required, index, line, column, marker)
        # A simple key may start with ALIAS, ANCHOR, TAG, SCALAR(flow),
        # '[', or '{' tokens.
        self.possible_simple_keys = {}

    # Public methods.

    def check(self, *choices):
        # Check if the next token is one of the given types.
        while self.need_more_tokens():
            self.fetch_more_tokens()
        if self.tokens:
            for choice in choices:
                if isinstance(self.tokens[0], choice):
                    return True
        return False

    def peek(self):
        # Return the next token, but do not delete it from the queue.
        while self.need_more_tokens():
            self.fetch_more_tokens()
        if self.tokens:
            return self.tokens[0]

    def get(self):
        # Return the next token and remove it from the queue.
        while self.need_more_tokens():
            self.fetch_more_tokens()
        if self.tokens:
            self.tokens_taken += 1
            return self.tokens.pop(0)

    def __iter__(self):
        # Iterator protocol.
        while self.need_more_tokens():
            self.fetch_more_tokens()
        while self.tokens:
            self.tokens_taken += 1
            yield self.tokens.pop(0)
            while self.need_more_tokens():
                self.fetch_more_tokens()

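    # Example usage (a sketch; assumes a Reader object providing the
    # peek/prefix/forward/get_marker interface described in __init__):
    #
    #   scanner = Scanner(reader)
    #   while not scanner.check(StreamEndToken):
    #       token = scanner.get()
    #       ...
    #
    # or, using the iterator interface added in this revision:
    #
    #   for token in Scanner(reader):
    #       ...
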
    # Private methods.

    def need_more_tokens(self):
        if self.done:
            return False
        if not self.tokens:
            return True
        # The current token may be a potential simple key, so we
        # need to look further.
        self.stale_possible_simple_keys()
        if self.next_possible_simple_key() == self.tokens_taken:
            return True

    def fetch_more_tokens(self):

        # Eat whitespaces and comments until we reach the next token.
        self.scan_to_next_token()

        # Remove obsolete possible simple keys.
        self.stale_possible_simple_keys()

        # Compare the current indentation and column. It may add some tokens
        # and decrease the current indentation level.
        self.unwind_indent(self.reader.column)

        # Peek the next character.
        ch = self.reader.peek()

        # Is it the end of stream?
        if ch == u'\0':
            return self.fetch_stream_end()

        # Is it a directive?
        if ch == u'%' and self.check_directive():
            return self.fetch_directive()

        # Is it the document start?
        if ch == u'-' and self.check_document_start():
            return self.fetch_document_start()

        # Is it the document end?
        if ch == u'.' and self.check_document_end():
            return self.fetch_document_end()

        # Note: the order of the following checks is NOT significant.

        # Is it the flow sequence start indicator?
        if ch == u'[':
            return self.fetch_flow_sequence_start()

        # Is it the flow mapping start indicator?
        if ch == u'{':
            return self.fetch_flow_mapping_start()

        # Is it the flow sequence end indicator?
        if ch == u']':
            return self.fetch_flow_sequence_end()

        # Is it the flow mapping end indicator?
        if ch == u'}':
            return self.fetch_flow_mapping_end()

        # Is it the flow entry indicator?
        if ch == u',':
            return self.fetch_flow_entry()

        # Is it the block entry indicator?
        if ch == u'-' and self.check_block_entry():
            return self.fetch_block_entry()

        # Is it the key indicator?
        if ch == u'?' and self.check_key():
            return self.fetch_key()

        # Is it the value indicator?
        if ch == u':' and self.check_value():
            return self.fetch_value()

        # Is it an alias?
        if ch == u'*':
            return self.fetch_alias()

        # Is it an anchor?
        if ch == u'&':
            return self.fetch_anchor()

        # Is it a tag?
        if ch == u'!':
            return self.fetch_tag()

        # Is it a literal scalar?
        if ch == u'|' and not self.flow_level:
            return self.fetch_literal()

        # Is it a folded scalar?
        if ch == u'>' and not self.flow_level:
            return self.fetch_folded()

        # Is it a single quoted scalar?
        if ch == u'\'':
            return self.fetch_single()

        # Is it a double quoted scalar?
        if ch == u'\"':
            return self.fetch_double()

        # It must be a plain scalar then.
        if self.check_plain():
            return self.fetch_plain()

        # No? It's an error. Let's produce a nice error message.
        raise ScannerError("while scanning for the next token", None,
                "found character %r that cannot start any token"
                % ch.encode('utf-8'), self.reader.get_marker())

    # Simple keys treatment.

    def next_possible_simple_key(self):
        # Return the number of the nearest possible simple key. Actually we
        # don't need to loop through the whole dictionary. We may replace it
        # with the following code:
        #   if not self.possible_simple_keys:
        #       return None
        #   return self.possible_simple_keys[
        #           min(self.possible_simple_keys.keys())].token_number
        min_token_number = None
        for level in self.possible_simple_keys:
            key = self.possible_simple_keys[level]
            if min_token_number is None or key.token_number < min_token_number:
                min_token_number = key.token_number
        return min_token_number

    def stale_possible_simple_keys(self):
        # Remove entries that are no longer possible simple keys. According to
        # the YAML specification, simple keys
        # - should be limited to a single line,
        # - should be no longer than 1024 characters.
        # Disabling this procedure will allow simple keys of any length and
        # height (may cause problems if indentation is broken though).
        for level in self.possible_simple_keys.keys():
            key = self.possible_simple_keys[level]
            if key.line != self.reader.line  \
                    or self.reader.index-key.index > 1024:
                if key.required:
                    raise ScannerError("while scanning a simple key", key.marker,
                            "could not find expected ':'", self.reader.get_marker())
                del self.possible_simple_keys[level]

    def save_possible_simple_key(self):
        # The next token may start a simple key. We check if it's possible
        # and save its position. This function is called for
        #   ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'.

        # Check if a simple key is required at the current position.
        required = not self.flow_level and self.indent == self.reader.column

        # A simple key is required only if it is the first token on the
        # current line, and in that position simple keys are always allowed.
        assert self.allow_simple_key or not required

        # The next token might be a simple key. Let's save its number and
        # position.
        if self.allow_simple_key:
            self.remove_possible_simple_key()
            token_number = self.tokens_taken+len(self.tokens)
            index = self.reader.index
            line = self.reader.line
            column = self.reader.column
            marker = self.reader.get_marker()
            key = SimpleKey(token_number, required,
                    index, line, column, marker)
            self.possible_simple_keys[self.flow_level] = key

    def remove_possible_simple_key(self):
        # Remove the saved possible key position at the current flow level.
        if self.flow_level in self.possible_simple_keys:
            key = self.possible_simple_keys[self.flow_level]

            # I don't think it's possible, but I could be wrong.
            assert not key.required
            #if key.required:
            #    raise ScannerError("while scanning a simple key", key.marker,
            #            "could not find expected ':'", self.reader.get_marker())
            del self.possible_simple_keys[self.flow_level]

    # Indentation functions.

    def unwind_indent(self, column):

        # In the flow context, tokens should respect indentation.
        # Strictly, the condition should be `self.indent >= column` according
        # to the spec, but that would prohibit intuitively correct
        # constructions such as
        #   key : {
        #   }
        if self.flow_level and self.indent > column:
            raise ScannerError(None, None,
                    "invalid indentation or unclosed '[' or '{'",
                    self.reader.get_marker())

        # In the block context, we may need to issue the BLOCK-END tokens.
        while self.indent > column:
            marker = self.reader.get_marker()
            self.indent = self.indents.pop()
            self.tokens.append(BlockEndToken(marker, marker))

    def add_indent(self, column):
        # Check if we need to increase indentation.
        if self.indent < column:
            self.indents.append(self.indent)
            self.indent = column
            return True
        return False
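
    # For example (an illustrative sketch), while scanning
    #   a:
    #     b: 1
    # the scanner calls add_indent(0) for the outer mapping and add_indent(2)
    # for the nested one; at the end of the block, unwind_indent pops the
    # saved levels off `self.indents` and emits one BLOCK-END token per
    # popped level.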

    # Fetchers.

    def fetch_stream_end(self):

        # Set the current indentation to -1.
        self.unwind_indent(-1)

        # Reset everything (not really needed).
        self.allow_simple_key = False
        self.possible_simple_keys = {}

        # Read the token.
        marker = self.reader.get_marker()

        # Add STREAM-END.
        self.tokens.append(StreamEndToken(marker, marker))

        # The stream is finished.
        self.done = True

    def fetch_directive(self):

        # Set the current indentation to -1.
        self.unwind_indent(-1)

        # Reset simple keys.
        self.remove_possible_simple_key()
        self.allow_simple_key = False

        # Scan and add DIRECTIVE.
        self.tokens.append(self.scan_directive())

    def fetch_document_start(self):
        self.fetch_document_indicator(DocumentStartToken)

    def fetch_document_end(self):
        self.fetch_document_indicator(DocumentEndToken)

    def fetch_document_indicator(self, TokenClass):

        # Set the current indentation to -1.
        self.unwind_indent(-1)

        # Reset simple keys. Note that there cannot be a block collection
        # after '---'.
        self.remove_possible_simple_key()
        self.allow_simple_key = False

        # Add DOCUMENT-START or DOCUMENT-END.
        start_marker = self.reader.get_marker()
        self.reader.forward(3)
        end_marker = self.reader.get_marker()
        self.tokens.append(TokenClass(start_marker, end_marker))

    def fetch_flow_sequence_start(self):
        self.fetch_flow_collection_start(FlowSequenceStartToken)

    def fetch_flow_mapping_start(self):
        self.fetch_flow_collection_start(FlowMappingStartToken)

    def fetch_flow_collection_start(self, TokenClass):

        # '[' and '{' may start a simple key.
        self.save_possible_simple_key()

        # Increase the flow level.
        self.flow_level += 1

        # Simple keys are allowed after '[' and '{'.
        self.allow_simple_key = True

        # Add FLOW-SEQUENCE-START or FLOW-MAPPING-START.
        start_marker = self.reader.get_marker()
        self.reader.forward()
        end_marker = self.reader.get_marker()
        self.tokens.append(TokenClass(start_marker, end_marker))

    def fetch_flow_sequence_end(self):
        self.fetch_flow_collection_end(FlowSequenceEndToken)

    def fetch_flow_mapping_end(self):
        self.fetch_flow_collection_end(FlowMappingEndToken)

    def fetch_flow_collection_end(self, TokenClass):

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Decrease the flow level.
        self.flow_level -= 1

        # No simple keys after ']' or '}'.
        self.allow_simple_key = False

        # Add FLOW-SEQUENCE-END or FLOW-MAPPING-END.
        start_marker = self.reader.get_marker()
        self.reader.forward()
        end_marker = self.reader.get_marker()
        self.tokens.append(TokenClass(start_marker, end_marker))

    def fetch_flow_entry(self):

        # Simple keys are allowed after ','.
        self.allow_simple_key = True

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Add FLOW-ENTRY.
        start_marker = self.reader.get_marker()
        self.reader.forward()
        end_marker = self.reader.get_marker()
        self.tokens.append(FlowEntryToken(start_marker, end_marker))

    def fetch_block_entry(self):

        # Block context needs additional checks.
        if not self.flow_level:

            # Are we allowed to start a new entry?
            if not self.allow_simple_key:
                raise ScannerError(None, None,
                        "sequence entries are not allowed here",
                        self.reader.get_marker())

            # We may need to add BLOCK-SEQUENCE-START.
            if self.add_indent(self.reader.column):
                marker = self.reader.get_marker()
                self.tokens.append(BlockSequenceStartToken(marker, marker))

        # It's an error for the block entry to occur in the flow context,
        # but we let the parser detect this.
        else:
            pass

        # Simple keys are allowed after '-'.
        self.allow_simple_key = True

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Add BLOCK-ENTRY.
        start_marker = self.reader.get_marker()
        self.reader.forward()
        end_marker = self.reader.get_marker()
        self.tokens.append(BlockEntryToken(start_marker, end_marker))
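
    # For example (an illustrative sketch), the document
    #   - one
    #   - two
    # is scanned into BLOCK-SEQUENCE-START, then BLOCK-ENTRY and
    # SCALAR('one', plain) for the first item, BLOCK-ENTRY and
    # SCALAR('two', plain) for the second, and finally BLOCK-END.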

    def fetch_key(self):

        # Block context needs additional checks.
        if not self.flow_level:

            # Are we allowed to start a key (not necessarily a simple one)?
            if not self.allow_simple_key:
                raise ScannerError(None, None,
                        "mapping keys are not allowed here",
                        self.reader.get_marker())

            # We may need to add BLOCK-MAPPING-START.
            if self.add_indent(self.reader.column):
                marker = self.reader.get_marker()
                self.tokens.append(BlockMappingStartToken(marker, marker))

        # Simple keys are allowed after '?' in the block context.
        self.allow_simple_key = not self.flow_level

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Add KEY.
        start_marker = self.reader.get_marker()
        self.reader.forward()
        end_marker = self.reader.get_marker()
        self.tokens.append(KeyToken(start_marker, end_marker))

    def fetch_value(self):

        # Do we have a pending simple key?
        if self.flow_level in self.possible_simple_keys:

            # Add KEY.
            key = self.possible_simple_keys[self.flow_level]
            del self.possible_simple_keys[self.flow_level]
            self.tokens.insert(key.token_number-self.tokens_taken,
                    KeyToken(key.marker, key.marker))

            # If this key starts a new block mapping, we need to add
            # BLOCK-MAPPING-START.
            if not self.flow_level:
                if self.add_indent(key.column):
                    self.tokens.insert(key.token_number-self.tokens_taken,
                            BlockMappingStartToken(key.marker, key.marker))

            # There cannot be two simple keys one after another.
            self.allow_simple_key = False

        # It must be a part of a complex key.
        else:

            # Block context needs additional checks.
            # (Do we really need them? They will be caught by the parser
            # anyway.)
            if not self.flow_level:

                # We are allowed to start a complex value if and only if
                # we can start a simple key.
                if not self.allow_simple_key:
                    raise ScannerError(None, None,
                            "mapping values are not allowed here",
                            self.reader.get_marker())

            # Simple keys are allowed after ':' in the block context.
            self.allow_simple_key = not self.flow_level

            # Reset possible simple key on the current level.
            self.remove_possible_simple_key()

        # Add VALUE.
        start_marker = self.reader.get_marker()
        self.reader.forward()
        end_marker = self.reader.get_marker()
        self.tokens.append(ValueToken(start_marker, end_marker))
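
    # For example (an illustrative sketch), for the document
    #   key: value
    # the scanner first emits SCALAR('key', plain) while remembering it as a
    # possible simple key; when the ':' indicator is reached, fetch_value
    # inserts BLOCK-MAPPING-START and KEY in front of that scalar and appends
    # VALUE, so the parser sees
    #   BLOCK-MAPPING-START, KEY, SCALAR('key'), VALUE, SCALAR('value'), ...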

    def fetch_alias(self):

        # ALIAS could be a simple key.
        self.save_possible_simple_key()

        # No simple keys after ALIAS.
        self.allow_simple_key = False

        # Scan and add ALIAS.
        self.tokens.append(self.scan_anchor(AliasToken))

    def fetch_anchor(self):

        # ANCHOR could start a simple key.
        self.save_possible_simple_key()

        # No simple keys after ANCHOR.
        self.allow_simple_key = False

        # Scan and add ANCHOR.
        self.tokens.append(self.scan_anchor(AnchorToken))

    def fetch_tag(self):

        # TAG could start a simple key.
        self.save_possible_simple_key()

        # No simple keys after TAG.
        self.allow_simple_key = False

        # Scan and add TAG.
        self.tokens.append(self.scan_tag())

    def fetch_literal(self):
        self.fetch_block_scalar(folded=False)

    def fetch_folded(self):
        self.fetch_block_scalar(folded=True)

    def fetch_block_scalar(self, folded):

        # A simple key may follow a block scalar.
        self.allow_simple_key = True

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Scan and add SCALAR.
        self.tokens.append(self.scan_block_scalar(folded))

    def fetch_single(self):
        self.fetch_flow_scalar(double=False)

    def fetch_double(self):
        self.fetch_flow_scalar(double=True)

    def fetch_flow_scalar(self, double):

        # A flow scalar could be a simple key.
        self.save_possible_simple_key()

        # No simple keys after flow scalars.
        self.allow_simple_key = False

        # Scan and add SCALAR.
        self.tokens.append(self.scan_flow_scalar(double))

    def fetch_plain(self):

        # A plain scalar could be a simple key.
        self.save_possible_simple_key()

        # No simple keys after plain scalars. But note that `scan_plain` will
        # change this flag if the scan is finished at the beginning of the
        # line.
        self.allow_simple_key = False

        # Scan and add SCALAR. May change `allow_simple_key`.
        self.tokens.append(self.scan_plain())

    # Checkers.

    def check_directive(self):

        # DIRECTIVE:        ^ '%' ...
        # The '%' indicator is already checked.
        if self.reader.column == 0:
            return True

    def check_document_start(self):

        # DOCUMENT-START:   ^ '---' (' '|'\n')
        if self.reader.column == 0:
            if self.reader.prefix(3) == u'---'  \
                    and self.reader.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
                return True

    def check_document_end(self):

        # DOCUMENT-END:     ^ '...' (' '|'\n')
        if self.reader.column == 0:
            if self.reader.prefix(3) == u'...'  \
                    and self.reader.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
                return True

    def check_block_entry(self):

        # BLOCK-ENTRY:      '-' (' '|'\n')
        return self.reader.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'

    def check_key(self):

        # KEY(flow context):    '?'
        if self.flow_level:
            return True

        # KEY(block context):   '?' (' '|'\n')
        else:
            return self.reader.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'

    def check_value(self):

        # VALUE(flow context):  ':'
        if self.flow_level:
            return True

        # VALUE(block context): ':' (' '|'\n')
        else:
            return self.reader.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'

    def check_plain(self):

        # A plain scalar may start with any non-space character except:
        #   '-', '?', ':', ',', '[', ']', '{', '}',
        #   '#', '&', '*', '!', '|', '>', '\'', '\"',
        #   '%', '@', '`'.
        #
        # It may also start with
        #   '-', '?', ':'
        # if it is followed by a non-space character.
        #
        # Note that we limit the last rule to the block context (except the
        # '-' character) because we want the flow context to be space
        # independent.
        ch = self.reader.peek()
        return ch not in u'\0 \t\r\n\x85\u2028\u2029-?:,[]{}#&*!|>\'\"%@`'  \
                or (self.reader.peek(1) not in u'\0 \t\r\n\x85\u2028\u2029'
                        and (ch == u'-' or (not self.flow_level and ch in u'?:')))

    # Scanners.

    def scan_to_next_token(self):
        # We ignore spaces, line breaks and comments.
        # If we find a line break in the block context, we set the flag
        # `allow_simple_key` on.
        # The byte order mark is stripped if it's the first character in the
        # stream. We do not yet support BOM inside the stream, as the
        # specification requires; any such mark will be considered part of
        # the document.
        if self.reader.index == 0 and self.reader.peek() == u'\uFEFF':
            self.reader.forward()
        found = False
        while not found:
            while self.reader.peek() == u' ':
                self.reader.forward()
            if self.reader.peek() == u'#':
                while self.reader.peek() not in u'\0\r\n\x85\u2028\u2029':
                    self.reader.forward()
            if self.scan_line_break():
                if not self.flow_level:
                    self.allow_simple_key = True
            else:
                found = True

    def scan_directive(self):
        # See the specification for details.
        start_marker = self.reader.get_marker()
        self.reader.forward()
        name = self.scan_directive_name(start_marker)
        value = None
        if name == u'YAML':
            value = self.scan_yaml_directive_value(start_marker)
            end_marker = self.reader.get_marker()
        elif name == u'TAG':
            value = self.scan_tag_directive_value(start_marker)
            end_marker = self.reader.get_marker()
        else:
            end_marker = self.reader.get_marker()
            while self.reader.peek() not in u'\0\r\n\x85\u2028\u2029':
                self.reader.forward()
        self.scan_directive_ignored_line(start_marker)
        return DirectiveToken(name, value, start_marker, end_marker)
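
    # For example (an illustrative sketch), the directive lines
    #   %YAML 1.1
    #   %TAG !e! tag:example.com,2006:
    # are scanned into DirectiveToken(u'YAML', (1, 1), ...) and
    # DirectiveToken(u'TAG', (u'!e!', u'tag:example.com,2006:'), ...)
    # respectively.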

    def scan_directive_name(self, start_marker):
        # See the specification for details.
        length = 0
        ch = self.reader.peek(length)
        while u'0' <= ch <= u'9' or u'A' <= ch <= u'Z' or u'a' <= ch <= u'z'  \
                or ch in u'-_':
            length += 1
            ch = self.reader.peek(length)
        if not length:
            raise ScannerError("while scanning a directive", start_marker,
                    "expected directive name, but found %r" % ch.encode('utf-8'),
                    self.reader.get_marker())
        value = self.reader.prefix(length)
        self.reader.forward(length)
        ch = self.reader.peek()
        if ch not in u'\0 \r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a directive", start_marker,
                    "expected alphabetic or numeric character, but found %r"
                    % ch.encode('utf-8'), self.reader.get_marker())
        return value

    def scan_yaml_directive_value(self, start_marker):
        # See the specification for details.
        while self.reader.peek() == u' ':
            self.reader.forward()
        major = self.scan_yaml_directive_number(start_marker)
        if self.reader.peek() != u'.':
            raise ScannerError("while scanning a directive", start_marker,
                    "expected a digit or '.', but found %r"
                    % self.reader.peek().encode('utf-8'),
                    self.reader.get_marker())
        self.reader.forward()
        minor = self.scan_yaml_directive_number(start_marker)
        if self.reader.peek() not in u'\0 \r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a directive", start_marker,
                    "expected a digit or ' ', but found %r"
                    % self.reader.peek().encode('utf-8'),
                    self.reader.get_marker())
        return (major, minor)

    def scan_yaml_directive_number(self, start_marker):
        # See the specification for details.
        ch = self.reader.peek()
        if not (u'0' <= ch <= u'9'):
            raise ScannerError("while scanning a directive", start_marker,
                    "expected a digit, but found %r" % ch.encode('utf-8'),
                    self.reader.get_marker())
        length = 0
        while u'0' <= self.reader.peek(length) <= u'9':
            length += 1
        value = int(self.reader.prefix(length))
        self.reader.forward(length)
        return value

    def scan_tag_directive_value(self, start_marker):
        # See the specification for details.
        while self.reader.peek() == u' ':
            self.reader.forward()
        handle = self.scan_tag_directive_handle(start_marker)
        while self.reader.peek() == u' ':
            self.reader.forward()
        prefix = self.scan_tag_directive_prefix(start_marker)
        return (handle, prefix)

    def scan_tag_directive_handle(self, start_marker):
        # See the specification for details.
        value = self.scan_tag_handle('directive', start_marker)
        ch = self.reader.peek()
        if ch != u' ':
            raise ScannerError("while scanning a directive", start_marker,
                    "expected ' ', but found %r" % ch.encode('utf-8'),
                    self.reader.get_marker())
        return value

    def scan_tag_directive_prefix(self, start_marker):
        # See the specification for details.
        value = self.scan_tag_uri('directive', start_marker)
        ch = self.reader.peek()
        if ch not in u'\0 \r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a directive", start_marker,
                    "expected ' ', but found %r" % ch.encode('utf-8'),
                    self.reader.get_marker())
        return value

    def scan_directive_ignored_line(self, start_marker):
        # See the specification for details.
        while self.reader.peek() == u' ':
            self.reader.forward()
        if self.reader.peek() == u'#':
            while self.reader.peek() not in u'\0\r\n\x85\u2028\u2029':
                self.reader.forward()
        ch = self.reader.peek()
        if ch not in u'\0\r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a directive", start_marker,
                    "expected a comment or a line break, but found %r"
                        % ch.encode('utf-8'), self.reader.get_marker())
        self.scan_line_break()

    def scan_anchor(self, TokenClass):
        # The specification does not restrict characters for anchors and
        # aliases. This may lead to problems, for instance, the document:
        #   [ *alias, value ]
        # can be interpreted in two ways, as
        #   [ "value" ]
        # and
        #   [ *alias , "value" ]
        # Therefore we restrict aliases to numbers and ASCII letters.
        start_marker = self.reader.get_marker()
        indicator = self.reader.peek()
        if indicator == u'*':
            name = 'alias'
        else:
            name = 'anchor'
        self.reader.forward()
        length = 0
        ch = self.reader.peek(length)
        while u'0' <= ch <= u'9' or u'A' <= ch <= u'Z' or u'a' <= ch <= u'z'  \
                or ch in u'-_':
            length += 1
            ch = self.reader.peek(length)
        if not length:
            raise ScannerError("while scanning an %s" % name, start_marker,
                    "expected anchor name, but found %r" % ch.encode('utf-8'),
                    self.reader.get_marker())
        value = self.reader.prefix(length)
        self.reader.forward(length)
        ch = self.reader.peek()
        if ch not in u'\0 \t\r\n\x85\u2028\u2029?:,]}%@`':
            raise ScannerError("while scanning an %s" % name, start_marker,
                    "expected alphabetic or numeric character, but found %r"
                    % ch.encode('utf-8'), self.reader.get_marker())
        end_marker = self.reader.get_marker()
        return TokenClass(value, start_marker, end_marker)

    def scan_tag(self):
        # See the specification for details.
        start_marker = self.reader.get_marker()
        ch = self.reader.peek(1)
        if ch == u'<':
            handle = None
            self.reader.forward(2)
            suffix = self.scan_tag_uri('tag', start_marker)
            if self.reader.peek() != u'>':
                raise ScannerError("while parsing a tag", start_marker,
                        "expected '>', but got %r" % self.reader.peek().encode('utf-8'),
                        self.reader.get_marker())
            self.reader.forward()
        elif ch in u'\0 \t\r\n\x85\u2028\u2029':
            handle = None
            suffix = u'!'
            self.reader.forward()
        else:
            length = 1
            use_handle = False
            while ch not in u'\0 \r\n\x85\u2028\u2029':
                if ch == u'!':
                    use_handle = True
                    break
                length += 1
                ch = self.reader.peek(length)
            if use_handle:
                handle = self.scan_tag_handle('tag', start_marker)
            else:
                handle = u'!'
                self.reader.forward()
            suffix = self.scan_tag_uri('tag', start_marker)
        ch = self.reader.peek()
        if ch not in u'\0 \r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a tag", start_marker,
                    "expected ' ', but found %r" % ch.encode('utf-8'),
                    self.reader.get_marker())
        value = (handle, suffix)
        end_marker = self.reader.get_marker()
        return TagToken(value, start_marker, end_marker)
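
    # For example (an illustrative sketch), the tags
    #   !!str                     -> handle u'!!', suffix u'str'
    #   !local                    -> handle u'!',  suffix u'local'
    #   !<tag:yaml.org,2002:str>  -> handle None,  suffix u'tag:yaml.org,2002:str'
    #   !                         -> handle None,  suffix u'!'
    # are returned as TagToken((handle, suffix), start_marker, end_marker).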

    def scan_block_scalar(self, folded):
        # See the specification for details.

        chunks = []
        start_marker = self.reader.get_marker()

        # Scan the header.
        self.reader.forward()
        chomping, increment = self.scan_block_scalar_indicators(start_marker)
        self.scan_block_scalar_ignored_line(start_marker)

        # Determine the indentation level and go to the first non-empty line.
        min_indent = self.indent+1
        if min_indent < 1:
            min_indent = 1
        if increment is None:
            breaks, max_indent, end_marker = self.scan_block_scalar_indentation()
            indent = max(min_indent, max_indent)
        else:
            indent = min_indent+increment-1
            breaks, end_marker = self.scan_block_scalar_breaks(indent)
        line_break = u''

        # Scan the inner part of the block scalar.
        while self.reader.column == indent and self.reader.peek() != u'\0':
            chunks.extend(breaks)
            leading_non_space = self.reader.peek() not in u' \t'
            length = 0
            while self.reader.peek(length) not in u'\0\r\n\x85\u2028\u2029':
                length += 1
            chunks.append(self.reader.prefix(length))
            self.reader.forward(length)
            line_break = self.scan_line_break()
            breaks, end_marker = self.scan_block_scalar_breaks(indent)
            if self.reader.column == indent and self.reader.peek() != u'\0':
                # Unfortunately, folding rules are ambiguous.
                #
                # This is the folding according to the specification:

                if folded and line_break == u'\n'   \
                        and leading_non_space and self.reader.peek() not in u' \t':
                    if not breaks:
                        chunks.append(u' ')
                else:
                    chunks.append(line_break)

                # This is Clark Evans's interpretation (also in the spec
                # examples):
                #
                #if folded and line_break == u'\n':
                #    if not breaks:
                #        if self.reader.peek() not in ' \t':
                #            chunks.append(u' ')
                #        else:
                #            chunks.append(line_break)
                #else:
                #    chunks.append(line_break)
            else:
                break

        # Chomp the tail.
        if chomping is not False:
            chunks.append(line_break)
        if chomping is True:
            chunks.extend(breaks)

        # We are done.
        return ScalarToken(u''.join(chunks), False, start_marker, end_marker)
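
    # For example (an illustrative sketch), the document
    #   |+2
    #     one
    #      two
    # (with the scalar content indented by two spaces) is scanned with
    # chomping=True and increment=2 and produces
    # SCALAR(u'one\n two\n', plain=False).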

    def scan_block_scalar_indicators(self, start_marker):
        # See the specification for details.
        chomping = None
        increment = None
        ch = self.reader.peek()
        if ch in u'+-':
            if ch == u'+':
                chomping = True
            else:
                chomping = False
            self.reader.forward()
            ch = self.reader.peek()
            if ch in u'0123456789':
                increment = int(ch)
                if increment == 0:
                    raise ScannerError("while scanning a block scalar", start_marker,
                            "expected indentation indicator in the range 1-9, but found 0",
                            self.reader.get_marker())
                self.reader.forward()
        elif ch in u'0123456789':
            increment = int(ch)
            if increment == 0:
                raise ScannerError("while scanning a block scalar", start_marker,
                        "expected indentation indicator in the range 1-9, but found 0",
                        self.reader.get_marker())
            self.reader.forward()
            ch = self.reader.peek()
            if ch in u'+-':
                if ch == u'+':
                    chomping = True
                else:
                    chomping = False
                self.reader.forward()
        ch = self.reader.peek()
        if ch not in u'\0 \r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a block scalar", start_marker,
                    "expected chomping or indentation indicators, but found %r"
                        % ch.encode('utf-8'), self.reader.get_marker())
        return chomping, increment

    def scan_block_scalar_ignored_line(self, start_marker):
        # See the specification for details.
        while self.reader.peek() == u' ':
            self.reader.forward()
        if self.reader.peek() == u'#':
            while self.reader.peek() not in u'\0\r\n\x85\u2028\u2029':
                self.reader.forward()
        ch = self.reader.peek()
        if ch not in u'\0\r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a block scalar", start_marker,
                    "expected a comment or a line break, but found %r"
                        % ch.encode('utf-8'), self.reader.get_marker())
        self.scan_line_break()

    def scan_block_scalar_indentation(self):
        # See the specification for details.
        chunks = []
        max_indent = 0
        end_marker = self.reader.get_marker()
        while self.reader.peek() in u' \r\n\x85\u2028\u2029':
            if self.reader.peek() != u' ':
                chunks.append(self.scan_line_break())
                end_marker = self.reader.get_marker()
            else:
                self.reader.forward()
                if self.reader.column > max_indent:
                    max_indent = self.reader.column
        return chunks, max_indent, end_marker

    def scan_block_scalar_breaks(self, indent):
        # See the specification for details.
        chunks = []
        end_marker = self.reader.get_marker()
        while self.reader.column < indent and self.reader.peek() == u' ':
            self.reader.forward()
        while self.reader.peek() in u'\r\n\x85\u2028\u2029':
            chunks.append(self.scan_line_break())
            end_marker = self.reader.get_marker()
            while self.reader.column < indent and self.reader.peek() == u' ':
                self.reader.forward()
        return chunks, end_marker

    def scan_flow_scalar(self, double):
        # See the specification for details.
        chunks = []
        start_marker = self.reader.get_marker()
        indent = self.indent+1
        if indent == 0:
            indent = 1
        quote = self.reader.peek()
        self.reader.forward()
        chunks.extend(self.scan_flow_scalar_non_spaces(double, indent, start_marker))
        while self.reader.peek() != quote:
            chunks.extend(self.scan_flow_scalar_spaces(double, indent, start_marker))
            chunks.extend(self.scan_flow_scalar_non_spaces(double, indent, start_marker))
        self.reader.forward()
        end_marker = self.reader.get_marker()
        return ScalarToken(u''.join(chunks), False, start_marker, end_marker)

    ESCAPE_REPLACEMENTS = {
        u'0':   u'\0',
        u'a':   u'\x07',
        u'b':   u'\x08',
        u't':   u'\x09',
        u'\t':  u'\x09',
        u'n':   u'\x0A',
        u'v':   u'\x0B',
        u'f':   u'\x0C',
        u'r':   u'\x0D',
        u'e':   u'\x1B',
        u' ':   u'\x20',
        u'\"':  u'\"',
        u'\\':  u'\\',
        u'N':   u'\x85',
        u'_':   u'\xA0',
        u'L':   u'\u2028',
        u'P':   u'\u2029',
    }

    ESCAPE_CODES = {
        u'x':   2,
        u'u':   4,
        u'U':   8,
    }
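
    # For example (an illustrative sketch), inside a double-quoted scalar the
    # escapes \t, \x41 and \u263A are replaced with u'\x09', u'A' and
    # u'\u263a' respectively, and an escaped space (backslash-space) becomes
    # u'\x20'.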

    def scan_flow_scalar_non_spaces(self, double, indent, start_marker):
        # See the specification for details.
        chunks = []
        while True:
            length = 0
            while self.reader.peek(length) not in u'\'\"\\\0 \t\r\n\x85\u2028\u2029':
                length += 1
            if length:
                chunks.append(self.reader.prefix(length))
                self.reader.forward(length)
            ch = self.reader.peek()
            if not double and ch == u'\'' and self.reader.peek(1) == u'\'':
                chunks.append(u'\'')
                self.reader.forward(2)
            elif (double and ch == u'\'') or (not double and ch in u'\"\\'):
                chunks.append(ch)
                self.reader.forward()
            elif double and ch == u'\\':
                self.reader.forward()
                ch = self.reader.peek()
                if ch in self.ESCAPE_REPLACEMENTS:
                    chunks.append(self.ESCAPE_REPLACEMENTS[ch])
                    self.reader.forward()
                elif ch in self.ESCAPE_CODES:
                    length = self.ESCAPE_CODES[ch]
                    self.reader.forward()
                    for k in range(length):
                        if self.reader.peek(k) not in u'0123456789ABCDEFabcdef':
                            raise ScannerError("while scanning a double-quoted scalar", start_marker,
                                    "expected escape sequence of %d hexadecimal digits, but found %r" %
                                        (length, self.reader.peek(k).encode('utf-8')), self.reader.get_marker())
                    code = int(self.reader.prefix(length), 16)
                    chunks.append(unichr(code))
                    self.reader.forward(length)
                elif ch in u'\r\n\x85\u2028\u2029':
                    self.scan_line_break()
                    chunks.extend(self.scan_flow_scalar_breaks(double, indent, start_marker))
                else:
                    raise ScannerError("while scanning a double-quoted scalar", start_marker,
                            "found unknown escape character %r" % ch.encode('utf-8'), self.reader.get_marker())
            else:
                return chunks

    def scan_flow_scalar_spaces(self, double, indent, start_marker):
        # See the specification for details.
        chunks = []
        length = 0
        while self.reader.peek(length) in u' \t':
            length += 1
        whitespaces = self.reader.prefix(length)
        self.reader.forward(length)
        ch = self.reader.peek()
        if ch == u'\0':
            raise ScannerError("while scanning a quoted scalar", start_marker,
                    "found unexpected end of stream", self.reader.get_marker())
        elif ch in u'\r\n\x85\u2028\u2029':
            line_break = self.scan_line_break()
            breaks = self.scan_flow_scalar_breaks(double, indent, start_marker)
            if line_break != u'\n':
                chunks.append(line_break)
            elif not breaks:
                chunks.append(u' ')
            chunks.extend(breaks)
        else:
            chunks.append(whitespaces)
        return chunks

    def scan_flow_scalar_breaks(self, double, indent, start_marker):
        # See the specification for details.
        chunks = []
        while True:
            while self.reader.column < indent and self.reader.peek() == u' ':
                self.reader.forward()
            if self.reader.column < indent  \
                    and self.reader.peek() not in u'\0\r\n\x85\u2028\u2029':
                s = 's'
                if indent == 1:
                    s = ''
                raise ScannerError("while scanning a quoted scalar", start_marker,
                        "expected %d space%s indentation, but found %r"
                        % (indent, s, self.reader.peek().encode('utf-8')),
                        self.reader.get_marker())
            while self.reader.peek() in u' \t':
                self.reader.forward()
            if self.reader.peek() in u'\r\n\x85\u2028\u2029':
                chunks.append(self.scan_line_break())
            else:
                return chunks

    def scan_plain(self):
        # See the specification for details.
        # We add an additional restriction for the flow context:
        #   plain scalars in the flow context cannot contain ':' and '?'.
        # We also keep track of the `allow_simple_key` flag here.
        chunks = []
        start_marker = self.reader.get_marker()
        end_marker = start_marker
        indent = self.indent+1
        if indent == 0:
            indent = 1
        spaces = []
        while True:
            length = 0
            if self.reader.peek() == u'#':
                break
            while True:
                ch = self.reader.peek(length)
                if ch in u'\0 \t\r\n\x85\u2028\u2029'   \
                        or (not self.flow_level and ch == u':' and
                                self.reader.peek(length+1) in u'\0 \t\r\n\x85\u2028\u2029') \
                        or (self.flow_level and ch in u',:?[]{}'):
                    break
                length += 1
            if length == 0:
                break
            self.allow_simple_key = False
            chunks.extend(spaces)
            chunks.append(self.reader.prefix(length))
            self.reader.forward(length)
            end_marker = self.reader.get_marker()
            spaces = self.scan_plain_spaces(indent)
            if not spaces or self.reader.peek() == u'#' \
                    or self.reader.column < indent:
                break
        return ScalarToken(u''.join(chunks), True, start_marker, end_marker)
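
    # For example (an illustrative sketch), the document
    #   first line
    #    second line
    # is scanned into a single SCALAR(u'first line second line', plain=True):
    # the line break and the following indentation are folded into one space
    # by scan_plain_spaces.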

    def scan_plain_spaces(self, indent):
        # See the specification for details.
        # The specification is really confusing about tabs in plain scalars.
        # We just forbid them completely. Do not use tabs in YAML!
        chunks = []
        length = 0
        while self.reader.peek(length) in u' ':
            length += 1
        whitespaces = self.reader.prefix(length)
        self.reader.forward(length)
        ch = self.reader.peek()
        if ch in u'\r\n\x85\u2028\u2029':
            line_break = self.scan_line_break()
            self.allow_simple_key = True
            breaks = []
            while self.reader.peek() in u' \r\n\x85\u2028\u2029':
                if self.reader.peek() == u' ':
                    self.reader.forward()
                else:
                    breaks.append(self.scan_line_break())
            if line_break != u'\n':
                chunks.append(line_break)
            elif not breaks:
                chunks.append(u' ')
            chunks.extend(breaks)
        elif whitespaces:
            chunks.append(whitespaces)
        return chunks

    def scan_tag_handle(self, name, start_marker):
        # See the specification for details.
        # For some strange reason, the specification does not allow '_' in
        # tag handles. I have allowed it anyway.
        ch = self.reader.peek()
        if ch != u'!':
            raise ScannerError("while scanning a %s" % name, start_marker,
                    "expected '!', but found %r" % ch.encode('utf-8'),
                    self.reader.get_marker())
        length = 1
        ch = self.reader.peek(length)
        if ch != u' ':
            while u'0' <= ch <= u'9' or u'A' <= ch <= u'Z' or u'a' <= ch <= u'z'  \
                    or ch in u'-_':
                length += 1
                ch = self.reader.peek(length)
            if ch != u'!':
                self.reader.forward(length)
                raise ScannerError("while scanning a %s" % name, start_marker,
                        "expected '!', but found %r" % ch.encode('utf-8'),
                        self.reader.get_marker())
            length += 1
        value = self.reader.prefix(length)
        self.reader.forward(length)
        return value

    def scan_tag_uri(self, name, start_marker):
        # See the specification for details.
        # Note: we do not check if URI is well-formed.
        chunks = []
        length = 0
        ch = self.reader.peek(length)
        while u'0' <= ch <= u'9' or u'A' <= ch <= u'Z' or u'a' <= ch <= u'z'  \
                or ch in u'-;/?:@&=+$,_.!~*\'()[]%':
            if ch == u'%':
                chunks.append(self.reader.prefix(length))
                self.reader.forward(length)
                length = 0
                chunks.append(self.scan_uri_escapes(name, start_marker))
            else:
                length += 1
            ch = self.reader.peek(length)
        if length:
            chunks.append(self.reader.prefix(length))
            self.reader.forward(length)
            length = 0
        if not chunks:
            raise ScannerError("while parsing a %s" % name, start_marker,
                    "expected URI, but found %r" % ch.encode('utf-8'),
                    self.reader.get_marker())
        return u''.join(chunks)

    def scan_uri_escapes(self, name, start_marker):
        # See the specification for details.
        bytes = []
        marker = self.reader.get_marker()
        while self.reader.peek() == u'%':
            self.reader.forward()
            for k in range(2):
                if self.reader.peek(k) not in u'0123456789ABCDEFabcdef':
                    raise ScannerError("while scanning a %s" % name, start_marker,
                            "expected URI escape sequence of 2 hexadecimal digits, but found %r" %
                                (self.reader.peek(k).encode('utf-8')), self.reader.get_marker())
            bytes.append(chr(int(self.reader.prefix(2), 16)))
            self.reader.forward(2)
        try:
            value = unicode(''.join(bytes), 'utf-8')
        except UnicodeDecodeError, exc:
            raise ScannerError("while scanning a %s" % name, start_marker, str(exc), marker)
        return value

    def scan_line_break(self):
        # Transforms:
        #   '\r\n'      :   '\n'
        #   '\r'        :   '\n'
        #   '\n'        :   '\n'
        #   '\x85'      :   '\n'
        #   '\u2028'    :   '\u2028'
        #   '\u2029'    :   '\u2029'
        #   default     :   ''
        ch = self.reader.peek()
        if ch in u'\r\n\x85':
            if self.reader.prefix(2) == u'\r\n':
                self.reader.forward(2)
            else:
                self.reader.forward()
            return u'\n'
        elif ch in u'\u2028\u2029':
            self.reader.forward()
            return ch
        return u''

#try:
#    import psyco
#    psyco.bind(Scanner)
#except ImportError:
#    pass
