source: branches/pyyaml3000/lib/yaml/scanner.py @ 48

Revision 48, 51.2 KB, checked in by xi, 8 years ago

Scanner is complete.

Rev  Line
[39]1
[43]2# Tokens:
3# YAML-DIRECTIVE(major_version, minor_version), TAG-DIRECTIVE(handle, prefix)
4# RESERVED-DIRECTIVE(name)
5# DOCUMENT-START, DOCUMENT-END, STREAM-END
6# BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END
7# FLOW-SEQUENCE-START, FLOW-MAPPING-START, FLOW-SEQUENCE-END, FLOW-MAPPING-END
8# ENTRY, KEY, VALUE
9# ALIAS(name), ANCHOR(name), TAG(value), SCALAR(value, plain)
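#
# For example (an illustrative sketch, not generated output), the document
#   key: [value]
# is scanned roughly into
#   BLOCK-MAPPING-START, KEY, SCALAR('key', plain), VALUE,
#   FLOW-SEQUENCE-START, SCALAR('value', plain), FLOW-SEQUENCE-END,
#   BLOCK-END, STREAM-END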
10
[46]11__all__ = ['Scanner', 'ScannerError']
[43]12
[46]13from error import YAMLError
14from tokens import *
[39]15
[46]16class ScannerError(YAMLError):
17    # TODO:
18    # ScannerError: while reading a quoted string
19    #         in '...', line 5, column 10:
20    # key: "valu\?e"
21    #      ^
22    # got unknown quote character '?'
23    #         in '...', line 5, column 15:
24    # key: "valu\?e"
25    #            ^
[47]26    def __init__(self, context=None, context_marker=None,
[48]27            problem=None, problem_marker=None):
[47]28        self.context = context
29        self.context_marker = context_marker
30        self.problem = problem
31        self.problem_marker = problem_marker
[43]32
[47]33    def __str__(self):
34        lines = []
35        for (place, marker) in [(self.context, self.context_marker),
36                                (self.problem, self.problem_marker)]:
37            if place is not None:
38                lines.append(place)
39                if marker is not None:
40                    lines.append(str(marker))
41        return '\n'.join(lines)
42
[43]43class SimpleKey:
44    def __init__(self, token_number, required, index, line, column, marker):
45        self.token_number = token_number
46        self.required = required
47        self.index = index
48        self.line = line
49        self.column = column
50        self.marker = marker
51
[39]52class Scanner:
53
[46]54
55    def __init__(self, reader):
[39]56        """Initialize the scanner."""
[46]57        # The input stream. The Reader class does the dirty work of checking for
[43]58        # BOM and converting the input data to Unicode. It also adds NUL to
59        # the end.
[39]60        #
[46]61        # Reader supports the following methods
[48]62        #   self.reader.peek(i=0)       # peek the next i-th character
63        #   self.reader.prefix(l=1)     # peek the next l characters
64        #   self.reader.forward(l=1)    # read the next l characters
65                                        # and move the pointer
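        #
        # For example (an illustrative sketch), with the input u'foo: bar':
        #   self.reader.peek()      # -> u'f'
        #   self.reader.prefix(3)   # -> u'foo'
        #   self.reader.forward(3)  # after this, peek() returns u':'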
[46]66        self.reader = reader
[39]67
68        # Have we reached the end of the stream?
69        self.done = False
70
71        # The number of unclosed '{' and '['. `flow_level == 0` means block
72        # context.
73        self.flow_level = 0
74
75        # List of processed tokens that are not yet emitted.
76        self.tokens = []
77
78        # Number of tokens that were emitted through the `get_token` method.
79        self.tokens_taken = 0
80
81        # The current indentation level.
82        self.indent = -1
83
84        # Past indentation levels.
85        self.indents = []
86
[43]87        # Variables related to simple keys treatment.
[39]88
89        # A simple key is a key that is not denoted by the '?' indicator.
90        # Example of simple keys:
91        #   ---
92        #   block simple key: value
93        #   ? not a simple key:
94        #   : { flow simple key: value }
95        # We emit the KEY token before all keys, so when we find a potential
96        # simple key, we try to locate the corresponding ':' indicator.
97        # Simple keys should be limited to a single line and 1024 characters.
98
[43]99        # Can a simple key start at the current position? A simple key may
100        # start:
101        # - at the beginning of the line, not counting indentation spaces
102        #       (in block context),
103        # - after '{', '[', ',' (in the flow context),
104        # - after '?', ':', '-' (in the block context).
105        # In the block context, this flag also signify if a block collection
106        # may start at the current position.
107        self.allow_simple_key = True
[39]108
109        # Keep track of possible simple keys. This is a dictionary. The key
110        # is `flow_level`; there can be no more than one possible simple key
[43]111        # for each level. The value is a SimpleKey record:
112        #   (token_number, required, index, line, column, marker)
113        # A simple key may start with ALIAS, ANCHOR, TAG, SCALAR(flow),
114        # '[', or '{' tokens.
[39]115        self.possible_simple_keys = {}
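        # For example (illustrative), while scanning the flow mapping
        #   { first: 1, second: 2 }
        # the entry self.possible_simple_keys[1] records the position of
        # 'first' (and later of 'second') until the matching ':' is found.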
116
[43]117    # Two public methods.
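    #
    # A minimal usage sketch (illustrative only; `data` and `process()` are
    # placeholders, and the Reader constructor is assumed to take the input
    # data as described in __init__ below):
    #   scanner = Scanner(Reader(data))
    #   token = scanner.get_token()
    #   while token is not None:
    #       process(token)
    #       token = scanner.get_token()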
[39]118
119    def peek_token(self):
120        """Get the current token without removing it."""
[43]121        while self.need_more_tokens():
[39]122            self.fetch_more_tokens()
123        if self.tokens:
124            return self.tokens[0]
125
126    def get_token(self):
[43]127        """Get the current token and remove it from the list of pending tokens."""
[39]128        while self.need_more_tokens():
129            self.fetch_more_tokens()
130        if self.tokens:
131            self.tokens_taken += 1
132            return self.tokens.pop(0)
133
[43]134    # Private methods.
[39]135
136    def need_more_tokens(self):
137        if self.done:
138            return False
139        if not self.tokens:
140            return True
141        # The current token may be a potential simple key, so we
142        # need to look further.
[43]143        self.stale_possible_simple_keys()
[39]144        if self.next_possible_simple_key() == self.tokens_taken:
145            return True
146
147    def fetch_more_tokens(self):
148
149        # Eat whitespaces and comments until we reach the next token.
[43]150        self.scan_to_next_token()
[39]151
[43]152        # Remove obsolete possible simple keys.
153        self.stale_possible_simple_keys()
154
[39]155        # Compare the current indentation and column. It may add some tokens
[43]156        # and decrease the current indentation level.
[46]157        self.unwind_indent(self.reader.column)
[39]158
159        # Peek the next character.
[46]160        ch = self.reader.peek()
[39]161
[48]162        # Is it the end of stream?
[43]163        if ch == u'\0':
[48]164            return self.fetch_stream_end()
[39]165
[48]166        # Is it the byte order mark?
167        if ch == u'\uFEFF':
168            return self.fetch_bom()
169
[39]170        # Is it a directive?
171        if ch == u'%' and self.check_directive():
172            return self.fetch_directive()
173
174        # Is it the document start?
175        if ch == u'-' and self.check_document_start():
176            return self.fetch_document_start()
177
178        # Is it the document end?
179        if ch == u'.' and self.check_document_end():
180            return self.fetch_document_end()
181
182        # Note: the order of the following checks is NOT significant.
183
184        # Is it the flow sequence start indicator?
185        if ch == u'[':
186            return self.fetch_flow_sequence_start()
187
188        # Is it the flow mapping start indicator?
189        if ch == u'{':
190            return self.fetch_flow_mapping_start()
191
192        # Is it the flow sequence end indicator?
193        if ch == u']':
194            return self.fetch_flow_sequence_end()
195
196        # Is it the flow mapping end indicator?
197        if ch == u'}':
198            return self.fetch_flow_mapping_end()
199
[43]200        # Is it the entry indicator?
201        if ch in u'-,' and self.check_entry():
202            return self.fetch_entry()
203
[39]204        # Is it the key indicator?
205        if ch == u'?' and self.check_key():
206            return self.fetch_key()
207
208        # Is it the value indicator?
209        if ch == u':' and self.check_value():
210            return self.fetch_value()
211
212        # Is it an alias?
213        if ch == u'*':
214            return self.fetch_alias()
215
216        # Is it an anchor?
217        if ch == u'&':
218            return self.fetch_anchor()
219
[43]220        # Is it a tag?
[39]221        if ch == u'!':
222            return self.fetch_tag()
223
[43]224        # Is it a literal scalar?
225        if ch == u'|' and not self.flow_level:
[39]226            return self.fetch_literal()
227
228        # Is it a folded scalar?
[43]229        if ch == u'>' and not self.flow_level:
[39]230            return self.fetch_folded()
231
232        # Is it a single quoted scalar?
233        if ch == u'\'':
234            return self.fetch_single()
235
236        # Is it a double quoted scalar?
237        if ch == u'\"':
238            return self.fetch_double()
239
[43]240        # It must be a plain scalar then.
[39]241        if self.check_plain():
242            return self.fetch_plain()
243
[43]244        # No? It's an error. Let's produce a nice error message.
[48]245        raise ScannerError("while scanning for the next token", None,
246                "found character %r that cannot start any token"
247                % ch.encode('utf-8'), self.reader.get_marker())
[39]248
[43]249    # Simple keys treatment.
250
251    def next_possible_simple_key(self):
252        # Return the number of the nearest possible simple key. Actually we
253        # don't need to loop through the whole dictionary. We may replace it
254        # with the following code:
255        #   if not self.possible_simple_keys:
256        #       return None
257        #   return self.possible_simple_keys[
258        #           min(self.possible_simple_keys.keys())].token_number
259        min_token_number = None
260        for level in self.possible_simple_keys:
261            key = self.possible_simple_keys[level]
262            if min_token_number is None or key.token_number < min_token_number:
263                min_token_number = key.token_number
264        return min_token_number
265
266    def stale_possible_simple_keys(self):
267        # Remove entries that are no longer possible simple keys. According to
268        # the YAML specification, simple keys
269        # - should be limited to a single line,
270        # - should be no longer than 1024 characters.
271        # Disabling this procedure will allow simple keys of any length and
272        # height (may cause problems if indentation is broken though).
273        for level in self.possible_simple_keys.keys():
274            key = self.possible_simple_keys[level]
[46]275            if key.line != self.reader.line  \
276                    or self.reader.index-key.index > 1024:
[43]277                if key.required:
[47]278                    raise ScannerError("while scanning a simple key", key.marker,
279                            "could not find expected ':'", self.reader.get_marker())
[43]280                del self.possible_simple_keys[level]
281
282    def save_possible_simple_key(self):
283        # The next token may start a simple key. We check if it's possible
284        # and save its position. This function is called for
285        #   ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'.
286
287        # Check if a simple key is required at the current position.
[46]288        required = not self.flow_level and self.indent == self.reader.column
[43]289
[47]290        # A simple key is required only if it is the first token in the current
291        # line. Therefore it is always allowed.
292        assert self.allow_simple_key or not required
293
[43]294        # The next token might be a simple key. Let's save its number and
295        # position.
296        if self.allow_simple_key:
297            self.remove_possible_simple_key()
298            token_number = self.tokens_taken+len(self.tokens)
[46]299            index = self.reader.index
300            line = self.reader.line
301            column = self.reader.column
302            marker = self.reader.get_marker()
[43]303            key = SimpleKey(token_number, required,
304                    index, line, column, marker)
305            self.possible_simple_keys[self.flow_level] = key
306
307    def remove_possible_simple_key(self):
308        # Remove the saved possible key position at the current flow level.
309        if self.flow_level in self.possible_simple_keys:
310            key = self.possible_simple_keys[self.flow_level]
[47]311           
312            # I don't think it's possible, but I could be wrong.
313            assert not key.required
314            #if key.required:
315            #    raise ScannerError("while scanning a simple key", key.marker,
316            #            "could not find expected ':'", self.reader.get_marker())
            del self.possible_simple_keys[self.flow_level]
[43]317
318    # Indentation functions.
319
320    def unwind_indent(self, column):
321
322        # In flow context, tokens should respect indentation.
[47]323        # Actually the condition should be `self.indent >= column` according to
324        # the spec. But this condition will prohibit intuitively correct
325        # constructions such as
326        # key : {
327        # }
[43]328        if self.flow_level and self.indent > column:
[47]329            raise ScannerError(None, None,
330                    "invalid indentation or unclosed '[' or '{'",
331                    self.reader.get_marker())
[43]332
333        # In block context, we may need to issue the BLOCK-END tokens.
334        while self.indent > column:
[46]335            marker = self.reader.get_marker()
[43]336            self.indent = self.indents.pop()
[44]337            self.tokens.append(BlockEndToken(marker, marker))
[43]338
339    def add_indent(self, column):
340        # Check if we need to increase indentation.
341        if self.indent < column:
342            self.indents.append(self.indent)
343            self.indent = column
344            return True
345        return False
346
347    # Fetchers.
348
[48]349    def fetch_stream_end(self):
[39]350
351        # Set the current indentation to -1.
[43]352        self.unwind_indent(-1)
[39]353
354        # Reset everything (not really needed).
[43]355        self.allow_simple_key = False
[39]356        self.possible_simple_keys = {}
357
[43]358        # Read the token.
[46]359        marker = self.reader.get_marker()
[43]360       
361        # Add STREAM-END.
[47]362        self.tokens.append(StreamEndToken(marker, marker))
[39]363
[46]364        # The input stream is exhausted.
[39]365        self.done = True
366
[48]367    def fetch_bom(self):
368        # We consider the BOM marker as a DOCUMENT-END indicator unless it's
369        # the first character in the stream. It's a reasonable approximation
370        # of the specification requirements. We can follow the specification
371        # literally, but it will require a new token class. Probably later.
372
373        # We ignore BOM if it is the first character in the stream.
374        if self.reader.index == 0:
375            self.reader.forward()
376
377        # Otherwise we issue DOCUMENT-END.
378        else:
379
380            # Set the current indentation to -1.
381            self.unwind_indent(-1)
382
383            # Reset simple keys. Note that there cannot be a block
384            # collection after BOM.
385            self.remove_possible_simple_key()
386            self.allow_simple_key = False
387
388            # Add DOCUMENT-END.
389            start_marker = self.reader.get_marker()
390            self.reader.forward()
391            end_marker = self.reader.get_marker()
392            self.tokens.append(DocumentEndToken(start_marker, end_marker))
393
[43]394    def fetch_directive(self):
395       
396        # Set the current indentation to -1.
397        self.unwind_indent(-1)
[39]398
[43]399        # Reset simple keys.
400        self.remove_possible_simple_key()
401        self.allow_simple_key = False
[39]402
[43]403        # Scan and add DIRECTIVE.
[47]404        self.tokens.append(self.scan_directive())
[39]405
406    def fetch_document_start(self):
[44]407        self.fetch_document_indicator(DocumentStartToken)
[39]408
[43]409    def fetch_document_end(self):
[44]410        self.fetch_document_indicator(DocumentEndToken)
[43]411
412    def fetch_document_indicator(self, TokenClass):
413
[39]414        # Set the current indentation to -1.
[43]415        self.unwind_indent(-1)
[39]416
[43]417        # Reset simple keys. Note that there cannot be a block collection
418        # after '---'.
419        self.remove_possible_simple_key()
420        self.allow_simple_key = False
[39]421
[43]422        # Add DOCUMENT-START or DOCUMENT-END.
[46]423        start_marker = self.reader.get_marker()
424        self.reader.forward(3)
425        end_marker = self.reader.get_marker()
[43]426        self.tokens.append(TokenClass(start_marker, end_marker))
[39]427
[43]428    def fetch_flow_sequence_start(self):
[44]429        self.fetch_flow_collection_start(FlowSequenceStartToken)
[39]430
[43]431    def fetch_flow_mapping_start(self):
[44]432        self.fetch_flow_collection_start(FlowMappingStartToken)
[43]433
434    def fetch_flow_collection_start(self, TokenClass):
435
[44]436        # '[' and '{' may start a simple key.
437        self.save_possible_simple_key()
438
[43]439        # Increase the flow level.
440        self.flow_level += 1
441
442        # Simple keys are allowed after '[' and '{'.
443        self.allow_simple_key = True
444
445        # Add FLOW-SEQUENCE-START or FLOW-MAPPING-START.
[46]446        start_marker = self.reader.get_marker()
447        self.reader.forward()
448        end_marker = self.reader.get_marker()
[43]449        self.tokens.append(TokenClass(start_marker, end_marker))
[39]450
[43]451    def fetch_flow_sequence_end(self):
[44]452        self.fetch_flow_collection_end(FlowSequenceEndToken)
[39]453
[43]454    def fetch_flow_mapping_end(self):
[44]455        self.fetch_flow_collection_end(FlowMappingEndToken)
[43]456
457    def fetch_flow_collection_end(self, TokenClass):
458
459        # Reset possible simple key on the current level.
460        self.remove_possible_simple_key()
461
462        # Decrease the flow level.
463        self.flow_level -= 1
464
465        # No simple keys after ']' or '}'.
466        self.allow_simple_key = False
467
468        # Add FLOW-SEQUENCE-END or FLOW-MAPPING-END.
[46]469        start_marker = self.reader.get_marker()
470        self.reader.forward()
471        end_marker = self.reader.get_marker()
[43]472        self.tokens.append(TokenClass(start_marker, end_marker))
[39]473
[43]474    def fetch_entry(self):
[39]475
[43]476        # Block context needs additional checks.
477        if not self.flow_level:
[39]478
[43]479            # Are we allowed to start a new entry?
480            if not self.allow_simple_key:
[47]481                raise ScannerError(None, None,
482                        "sequence entries are not allowed here",
483                        self.reader.get_marker())
[39]484
[43]485            # We may need to add BLOCK-SEQUENCE-START.
[46]486            if self.add_indent(self.reader.column):
487                marker = self.reader.get_marker()
[44]488                self.tokens.append(BlockSequenceStartToken(marker, marker))
[39]489
[43]490        # Simple keys are allowed after '-' and ','.
491        self.allow_simple_key = True
[39]492
[43]493        # Reset possible simple key on the current level.
494        self.remove_possible_simple_key()
[39]495
[43]496        # Add ENTRY.
[46]497        start_marker = self.reader.get_marker()
498        self.reader.forward()
499        end_marker = self.reader.get_marker()
[44]500        self.tokens.append(EntryToken(start_marker, end_marker))
[39]501
[43]502    def fetch_key(self):
503       
504        # Block context needs additional checks.
505        if not self.flow_level:
[39]506
[43]507            # Are we allowed to start a key (not necessarily a simple one)?
508            if not self.allow_simple_key:
[47]509                raise ScannerError(None, None,
510                        "mapping keys are not allowed here",
511                        self.reader.get_marker())
[43]512
513            # We may need to add BLOCK-MAPPING-START.
[46]514            if self.add_indent(self.reader.column):
515                marker = self.reader.get_marker()
[44]516                self.tokens.append(BlockMappingStartToken(marker, marker))
[43]517
518        # Simple keys are allowed after '?' in the block context.
519        self.allow_simple_key = not self.flow_level
520
521        # Reset possible simple key on the current level.
522        self.remove_possible_simple_key()
523
524        # Add KEY.
[46]525        start_marker = self.reader.get_marker()
526        self.reader.forward()
527        end_marker = self.reader.get_marker()
[44]528        self.tokens.append(KeyToken(start_marker, end_marker))
[39]529
[43]530    def fetch_value(self):
[39]531
[43]532        # Do we have a pending simple key?
533        if self.flow_level in self.possible_simple_keys:
[39]534
[43]535            # Add KEY.
536            key = self.possible_simple_keys[self.flow_level]
537            del self.possible_simple_keys[self.flow_level]
538            self.tokens.insert(key.token_number-self.tokens_taken,
[44]539                    KeyToken(key.marker, key.marker))
[39]540
[43]541            # If this key starts a new block mapping, we need to add
542            # BLOCK-MAPPING-START.
543            if not self.flow_level:
544                if self.add_indent(key.column):
545                    self.tokens.insert(key.token_number-self.tokens_taken,
[44]546                            BlockMappingStartToken(key.marker, key.marker))
[37]547
[43]548            # There cannot be two simple keys one after another.
549            self.allow_simple_key = False
[37]550
[43]551        # It must be a part of a complex key.
552        else:
553           
[47]554            # Block context needs additional checks.
555            # (Do we really need them? They will be caught by the parser
556            # anyway.)
557            if not self.flow_level:
558
559                # We are allowed to start a complex value if and only if
560                # we can start a simple key.
561                if not self.allow_simple_key:
562                    raise ScannerError(None, None,
563                            "mapping values are not allowed here",
564                            self.reader.get_marker())
565
[43]566            # Simple keys are allowed after ':' in the block context.
567            self.allow_simple_key = not self.flow_level
[37]568
[43]569            # Reset possible simple key on the current level.
570            self.remove_possible_simple_key()
[37]571
[43]572        # Add VALUE.
[46]573        start_marker = self.reader.get_marker()
574        self.reader.forward()
575        end_marker = self.reader.get_marker()
[44]576        self.tokens.append(ValueToken(start_marker, end_marker))
[37]577
[43]578    def fetch_alias(self):
[37]579
[43]580        # ALIAS could be a simple key.
581        self.save_possible_simple_key()
[37]582
[43]583        # No simple keys after ALIAS.
584        self.allow_simple_key = False
[37]585
[43]586        # Scan and add ALIAS.
[47]587        self.tokens.append(self.scan_anchor(AliasToken))
[37]588
[43]589    def fetch_anchor(self):
[37]590
[43]591        # ANCHOR could start a simple key.
592        self.save_possible_simple_key()
[37]593
[43]594        # No simple keys after ANCHOR.
595        self.allow_simple_key = False
[37]596
[43]597        # Scan and add ANCHOR.
[47]598        self.tokens.append(self.scan_anchor(AnchorToken))
[37]599
[43]600    def fetch_tag(self):
[37]601
[43]602        # TAG could start a simple key.
603        self.save_possible_simple_key()
[37]604
[43]605        # No simple keys after TAG.
606        self.allow_simple_key = False
[37]607
[43]608        # Scan and add TAG.
[47]609        self.tokens.append(self.scan_tag())
[37]610
[43]611    def fetch_literal(self):
612        self.fetch_block_scalar(folded=False)
[37]613
[43]614    def fetch_folded(self):
615        self.fetch_block_scalar(folded=True)
[37]616
[43]617    def fetch_block_scalar(self, folded):
[37]618
[43]619        # A simple key may follow a block scalar.
620        self.allow_simple_key = True
[37]621
[43]622        # Reset possible simple key on the current level.
623        self.remove_possible_simple_key()
[37]624
[43]625        # Scan and add SCALAR.
[47]626        self.tokens.append(self.scan_block_scalar(folded))
[37]627
[43]628    def fetch_single(self):
629        self.fetch_flow_scalar(double=False)
[37]630
[43]631    def fetch_double(self):
632        self.fetch_flow_scalar(double=True)
[37]633
[43]634    def fetch_flow_scalar(self, double):
[37]635
[43]636        # A flow scalar could be a simple key.
637        self.save_possible_simple_key()
[37]638
[43]639        # No simple keys after flow scalars.
640        self.allow_simple_key = False
[37]641
[43]642        # Scan and add SCALAR.
[47]643        self.tokens.append(self.scan_flow_scalar(double))
[37]644
[43]645    def fetch_plain(self):
[37]646
[43]647        # A plain scalar could be a simple key.
648        self.save_possible_simple_key()
[37]649
[43]650        # No simple keys after plain scalars. But note that `scan_plain` will
651        # change this flag if the scan is finished at the beginning of the
652        # line.
653        self.allow_simple_key = False
[37]654
[43]655        # Scan and add SCALAR. May change `allow_simple_key`.
[47]656        self.tokens.append(self.scan_plain())
[37]657
[43]658    # Checkers.
[37]659
[43]660    def check_directive(self):
[37]661
[43]662        # DIRECTIVE:        ^ '%' ...
663        # The '%' indicator is already checked.
[46]664        if self.reader.column == 0:
[43]665            return True
[37]666
[43]667    def check_document_start(self):
[37]668
[43]669        # DOCUMENT-START:   ^ '---' (' '|'\n')
[46]670        if self.reader.column == 0:
[48]671            if self.reader.prefix(3) == u'---'  \
672                    and self.reader.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
[43]673                return True
[37]674
[43]675    def check_document_end(self):
[37]676
[43]677        # DOCUMENT-END:     ^ '...' (' '|'\n')
[46]678        if self.reader.column == 0:
[48]680            if self.reader.prefix(3) == u'...'  \
681                    and self.reader.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
[43]682                return True
[37]683
[43]684    def check_entry(self):
685
686        # ENTRY(flow context):      ','
687        if self.flow_level:
[46]688            return self.reader.peek() == u','
[43]689
690        # ENTRY(block context):     '-' (' '|'\n')
691        else:
[48]692            return self.reader.peek() == u'-'   \
693                    and self.reader.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'
[43]694
695    def check_key(self):
696
697        # KEY(flow context):    '?'
698        if self.flow_level:
[37]699            return True
[43]700
701        # KEY(block context):   '?' (' '|'\n')
[37]702        else:
[48]703            return self.reader.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'
[37]704
[43]705    def check_value(self):
706
707        # VALUE(flow context):  ':'
708        if self.flow_level:
[37]709            return True
[43]710
711        # VALUE(block context): ':' (' '|'\n')
[37]712        else:
[48]713            return self.reader.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'
[37]714
[43]715    def check_plain(self):
[37]716
[48]717        # A plain scalar may start with any non-space character except:
718        #   '-', '?', ':', ',', '[', ']', '{', '}',
719        #   '#', '&', '*', '!', '|', '>', '\'', '\"',
720        #   '%', '@', '`'.
721        #
722        # It may also start with
723        #   '-', '?', ':'
724        # if it is followed by a non-space character.
725        #
726        # Note that we limit the last rule to the block context (except the
727        # '-' character) because we want the flow context to be space
728        # independent.
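        # For example (illustrative): '-foo' starts a plain scalar because
        # the '-' is followed by a non-space character, while '- foo' does
        # not (there '-' is an entry indicator).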
729        ch = self.reader.peek()
730        return ch not in u'\0 \t\r\n\x85\u2028\u2029-?:,[]{}#&*!|>\'\"%@`'  \
731                or (self.reader.peek(1) not in u'\0 \t\r\n\x85\u2028\u2029'
732                        and (ch == '-' or (not self.flow_level and ch in u'?:')))
733
[43]734    # Scanners.
735
736    def scan_to_next_token(self):
[47]737        # We ignore spaces, line breaks and comments.
738        # If we find a line break in the block context, we set the flag
739        # `allow_simple_key` on.
[43]740        found = False
741        while not found:
[46]742            while self.reader.peek() == u' ':
743                self.reader.forward()
744            if self.reader.peek() == u'#':
[47]745                while self.reader.peek() not in u'\0\r\n\x85\u2028\u2029':
[46]746                    self.reader.forward()
[47]747            if self.scan_line_break():
[43]748                if not self.flow_level:
749                    self.allow_simple_key = True
[37]750            else:
[43]751                found = True
[37]752
[43]753    def scan_directive(self):
[48]754        # See the specification for details.
755        start_marker = self.reader.get_marker()
756        self.reader.forward()
757        name = self.scan_directive_name(start_marker)
758        value = None
759        if name == u'YAML':
760            value = self.scan_yaml_directive_value(start_marker)
761            end_marker = self.reader.get_marker()
762        elif name == u'TAG':
763            value = self.scan_tag_directive_value(start_marker)
764            end_marker = self.reader.get_marker()
[43]765        else:
[48]766            end_marker = self.reader.get_marker()
767            while self.reader.peek() not in u'\0\r\n\x85\u2028\u2029':
768                self.reader.forward()
769        self.scan_directive_ignored_line(start_marker)
770        return DirectiveToken(name, value, start_marker, end_marker)
771
772    def scan_directive_name(self, start_marker):
773        # See the specification for details.
774        length = 0
775        ch = self.reader.peek(length)
776        while u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z'  \
777                or ch in u'-_':
778            length += 1
779            ch = self.reader.peek(length)
780        if not length:
781            raise ScannerError("while scanning a directive", start_marker,
782                    "expected directive name, but found %r" % ch.encode('utf-8'),
783                    self.reader.get_marker())
784        value = self.reader.prefix(length)
785        self.reader.forward(length)
786        ch = self.reader.peek()
787        if ch not in u'\0 \r\n\x85\u2028\u2029':
788            raise ScannerError("while scanning a directive", start_marker,
789                    "expected alphabetic or numeric character, but found %r"
790                    % ch.encode('utf-8'), self.reader.get_marker())
791        return value
792
793    def scan_yaml_directive_value(self, start_marker):
794        # See the specification for details.
795        while self.reader.peek() == u' ':
[46]796            self.reader.forward()
[48]797        major = self.scan_yaml_directive_number(start_marker)
798        if self.reader.peek() != '.':
799            raise ScannerError("while scanning a directive", start_marker,
800                    "expected a digit or '.', but found %r" % self.reader.peek().encode('utf-8'),
801                    self.reader.get_marker())
[46]802        self.reader.forward()
[48]803        minor = self.scan_yaml_directive_number(start_marker)
804        if self.reader.peek() not in u'\0 \r\n\x85\u2028\u2029':
805            raise ScannerError("while scanning a directive", start_marker,
806                    "expected a digit or ' ', but found %r" % self.reader.peek().encode('utf-8'),
807                    self.reader.get_marker())
808        return (major, minor)
[37]809
[48]810    def scan_yaml_directive_number(self, start_marker):
811        # See the specification for details.
812        ch = self.reader.peek()
813        if not (u'0' <= ch <= '9'):
814            raise ScannerError("while scanning a directive", start_marker,
815                    "expected a digit, but found %r" % ch.encode('utf-8'),
816                    self.reader.get_marker())
817        length = 0
818        while u'0' <= self.reader.peek(length) <= u'9':
819            length += 1
820        value = int(self.reader.prefix(length))
821        self.reader.forward(length)
822        return value
823
824    def scan_tag_directive_value(self, start_marker):
825        # See the specification for details.
826        while self.reader.peek() == u' ':
827            self.reader.forward()
828        handle = self.scan_tag_directive_handle(start_marker)
829        while self.reader.peek() == u' ':
830            self.reader.forward()
831        prefix = self.scan_tag_directive_prefix(start_marker)
832        return (handle, prefix)
833
834    def scan_tag_directive_handle(self, start_marker):
835        # See the specification for details.
836        value = self.scan_tag_handle('directive', start_marker)
837        if self.reader.peek() != u' ':
838            raise ScannerError("while scanning a directive", start_marker,
839                    "expected ' ', but found %r" % self.reader.peek().encode('utf-8'),
840                    self.reader.get_marker())
841        return value
842
843    def scan_tag_directive_prefix(self, start_marker):
844        # See the specification for details.
845        value = self.scan_tag_uri('directive', start_marker)
846        ch = self.reader.peek()
847        if ch not in u'\0 \r\n\x85\u2028\u2029':
848            raise ScannerError("while scanning a directive", start_marker,
849                    "expected ' ', but found %r" % ch.encode('utf-8'),
850                    self.reader.get_marker())
851        return value
852
853    def scan_directive_ignored_line(self, start_marker):
854        # See the specification for details.
855        while self.reader.peek() == u' ':
856            self.reader.forward()
857        if self.reader.peek() == u'#':
858            while self.reader.peek() not in u'\0\r\n\x85\u2028\u2029':
859                self.reader.forward()
860        ch = self.reader.peek()
861        if ch not in u'\0\r\n\x85\u2028\u2029':
862            raise ScannerError("while scanning a directive", start_marker,
863                    "expected a comment or a line break, but found %r"
864                        % ch.encode('utf-8'), self.reader.get_marker())
865        self.scan_line_break()
866
[43]867    def scan_anchor(self, TokenClass):
[48]868        # The specification does not restrict characters for anchors and
869        # aliases. This may lead to problems, for instance, the document:
870        #   [ *alias, value ]
871        # can be interpreted in two ways, as
872        #   [ "value" ]
873        # and
874        #   [ *alias , "value" ]
875        # Therefore we restrict aliases to numbers and ASCII letters.
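        # For example (illustrative): '&anchor-1' and '*alias_2' are accepted;
        # '&foo bar' yields the anchor name 'foo', and a bare '&' is an error
        # because the name would be empty.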
[46]876        start_marker = self.reader.get_marker()
[48]877        indicator = self.reader.peek()
878        if indicator == '*':
879            name = 'alias'
880        else:
881            name = 'anchor'
882        self.reader.forward()
883        length = 0
884        ch = self.reader.peek(length)
885        while u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z'  \
886                or ch in u'-_':
887            length += 1
888            ch = self.reader.peek(length)
889        if not length:
890            raise ScannerError("while scanning an %s" % name, start_marker,
891                    "expected anchor name, but found %r" % ch.encode('utf-8'),
892                    self.reader.get_marker())
893        value = self.reader.prefix(length)
894        self.reader.forward(length)
895        ch = self.reader.peek()
896        if ch not in u'\0 \t\r\n\x85\u2028\u2029?:,]}%@`':
897            raise ScannerError("while scanning an %s" % name, start_marker,
898                    "expected alphabetic or numeric character, but found %r"
899                    % ch.encode('utf-8'), self.reader.get_marker())
[46]900        end_marker = self.reader.get_marker()
[48]901        return TokenClass(value, start_marker, end_marker)
[37]902
[43]903    def scan_tag(self):
[48]904        # See the specification for details.
[46]905        start_marker = self.reader.get_marker()
[48]906        ch = self.reader.peek(1)
907        if ch == u'<':
908            handle = None
909            self.reader.forward(2)
910            suffix = self.scan_tag_uri('tag', start_marker)
911            if self.reader.peek() != u'>':
912                raise ScannerError("while parsing a tag", start_marker,
913                        "expected '>', but got %r" % self.reader.peek().encode('utf-8'),
914                        self.reader.get_marker())
[46]915            self.reader.forward()
[48]916        elif ch in u'\0 \t\r\n\x85\u2028\u2029':
917            handle = None
918            suffix = u'!'
919            self.reader.forward()
920        else:
921            length = 1
922            use_handle = False
923            while ch not in u'\0 \r\n\x85\u2028\u2029':
924                if ch == u'!':
925                    use_handle = True
926                    break
927                length += 1
928                ch = self.reader.peek(length)
929            handle = u'!'
930            if use_handle:
931                handle = self.scan_tag_handle('tag', start_marker)
932            else:
933                handle = u'!'
934                self.reader.forward()
935            suffix = self.scan_tag_uri('tag', start_marker)
936        ch = self.reader.peek()
937        if ch not in u'\0 \r\n\x85\u2028\u2029':
938            raise ScannerError("while scanning a tag", start_marker,
939                    "expected ' ', but found %r" % ch.encode('utf-8'),
940                    self.reader.get_marker())
941        value = (handle, suffix)
[46]942        end_marker = self.reader.get_marker()
[48]943        return TagToken(value, start_marker, end_marker)
[43]944
945    def scan_block_scalar(self, folded):
[48]946        # See the specification for details.
947
948        chunks = []
[46]949        start_marker = self.reader.get_marker()
[48]950
951        # Scan the header.
952        self.reader.forward()
953        chomping, increment = self.scan_block_scalar_indicators(start_marker)
954        self.scan_block_scalar_ignored_line(start_marker)
955
956        # Determine the indentation level and go to the first non-empty line.
957        min_indent = self.indent+1
958        if min_indent < 1:
959            min_indent = 1
960        if increment is None:
961            breaks, max_indent, end_marker = self.scan_block_scalar_indentation()
962            indent = max(min_indent, max_indent)
963        else:
964            indent = min_indent+increment-1
965            breaks, end_marker = self.scan_block_scalar_breaks(indent)
966        line_break = u''
967
968        # Scan the inner part of the block scalar.
969        while self.reader.column == indent and self.reader.peek() != u'\0':
970            chunks.extend(breaks)
971            leading_non_space = self.reader.peek() not in u' \t'
972            length = 0
973            while self.reader.peek(length) not in u'\0\r\n\x85\u2028\u2029':
974                length += 1
975            chunks.append(self.reader.prefix(length))
976            self.reader.forward(length)
977            line_break = self.scan_line_break()
978            breaks, end_marker = self.scan_block_scalar_breaks(indent)
979            if self.reader.column == indent and self.reader.peek() != u'\0':
980                # Unfortunately, folding rules are ambiguous.
981                #
982                # This is the folding according to the specification:
983                #
984                #if folded and line_break == u'\n'   \
985                #        and leading_non_space and self.reader.peek() not in u' \t':
986                #    if not breaks:
987                #        chunks.append(u' ')
988                #else:
989                #    chunks.append(line_break)
990                #
991                # This is Clark Evans's interpretation (also in the spec
992                # examples):
993                #
994                if folded and line_break == u'\n':
995                    if not breaks:
996                        if self.reader.peek() not in ' \t':
997                            chunks.append(u' ')
998                        else:
999                            chunks.append(line_break)
1000                else:
1001                    chunks.append(line_break)
1002            else:
1003                break
1004
1005        # Chomp the tail.
1006        if chomping is not False:
1007            chunks.append(line_break)
1008        if chomping is True:
1009            chunks.extend(breaks)
1010
1011        # We are done.
1012        return ScalarToken(u''.join(chunks), False, start_marker, end_marker)
1013
1014    def scan_block_scalar_indicators(self, start_marker):
1015        # See the specification for details.
1016        chomping = None
1017        increment = None
1018        ch = self.reader.peek()
1019        if ch in u'+-':
1020            if ch == '+':
1021                chomping = True
1022            else:
1023                chomping = False
1024            self.reader.forward()
1025            ch = self.reader.peek()
1026            if ch in u'0123456789':
1027                increment = int(ch)
1028                if increment == 0:
1029                    raise ScannerError("while scanning a block scalar", start_marker,
1030                            "expected indentation indicator in the range 1-9, but found 0",
1031                            self.reader.get_marker())
[46]1032                self.reader.forward()
[48]1033        elif ch in u'0123456789':
1034            increment = int(ch)
1035            if increment == 0:
1036                raise ScannerError("while scanning a block scalar", start_marker,
1037                        "expected indentation indicator in the range 1-9, but found 0",
1038                        self.reader.get_marker())
1039            self.reader.forward()
1040            ch = self.reader.peek()
1041            if ch in u'+-':
1042                if ch == '+':
1043                    chomping = True
1044                else:
1045                    chomping = False
[46]1046                self.reader.forward()
[48]1047        ch = self.reader.peek()
1048        if ch not in u'\0 \r\n\x85\u2028\u2029':
1049            raise ScannerError("while scanning a block scalar", start_marker,
1050                    "expected chomping or indentation indicators, but found %r"
1051                        % ch.encode('utf-8'), self.reader.get_marker())
1052        return chomping, increment
1053
1054    def scan_block_scalar_ignored_line(self, start_marker):
1055        # See the specification for details.
1056        while self.reader.peek() == u' ':
1057            self.reader.forward()
1058        if self.reader.peek() == u'#':
1059            while self.reader.peek() not in u'\0\r\n\x85\u2028\u2029':
[46]1060                self.reader.forward()
[48]1061        ch = self.reader.peek()
1062        if ch not in u'\0\r\n\x85\u2028\u2029':
1063            raise ScannerError("while scanning a block scalar", start_marker,
1064                    "expected a comment or a line break, but found %r"
1065                        % ch.encode('utf-8'), self.reader.get_marker())
1066        self.scan_line_break()
[43]1067
[48]1068    def scan_block_scalar_indentation(self):
1069        # See the specification for details.
1070        chunks = []
1071        max_indent = 0
1072        end_marker = self.reader.get_marker()
1073        while self.reader.peek() in u' \r\n\x85\u2028\u2029':
1074            if self.reader.peek() != u' ':
1075                chunks.append(self.scan_line_break())
1076                end_marker = self.reader.get_marker()
1077            else:
1078                self.reader.forward()
1079                if self.reader.column > max_indent:
1080                    max_indent = self.reader.column
1081        return chunks, max_indent, end_marker
1082
1083    def scan_block_scalar_breaks(self, indent):
1084        # See the specification for details.
1085        chunks = []
1086        end_marker = self.reader.get_marker()
1087        while self.reader.column < indent and self.reader.peek() == u' ':
1088            self.reader.forward()
1089        while self.reader.peek() in u'\r\n\x85\u2028\u2029':
1090            chunks.append(self.scan_line_break())
1091            end_marker = self.reader.get_marker()
1092            while self.reader.column < indent and self.reader.peek() == u' ':
1093                self.reader.forward()
1094        return chunks, end_marker
1095
[43]1096    def scan_flow_scalar(self, double):
[48]1097        # See the specification for details.
1098        chunks = []
1099        start_marker = self.reader.get_marker()
1100        indent = self.indent+1
1101        if indent == 0:
1102            indent = 1
[46]1103        quote = self.reader.peek()
1104        self.reader.forward()
[48]1105        chunks.extend(self.scan_flow_scalar_non_spaces(double, indent, start_marker))
[46]1106        while self.reader.peek() != quote:
[48]1107            chunks.extend(self.scan_flow_scalar_spaces(double, indent, start_marker))
1108            chunks.extend(self.scan_flow_scalar_non_spaces(double, indent, start_marker))
1109        self.reader.forward()
1110        end_marker = self.reader.get_marker()
1111        return ScalarToken(u''.join(chunks), False, start_marker, end_marker)
1112
1113    ESCAPE_REPLACEMENTS = {
1114        u'0':   u'\0',
1115        u'a':   u'\x07',
1116        u'b':   u'\x08',
1117        u't':   u'\x09',
1118        u'\t':  u'\x09',
1119        u'n':   u'\x0A',
1120        u'v':   u'\x0B',
1121        u'f':   u'\x0C',
1122        u'r':   u'\x0D',
1123        u'e':   u'\x1B',
1124        u' ':   u'\x20',
1125        u'\"':  u'\"',
1126        u'\\':  u'\\',
1127        u'N':   u'\x85',
1128        u'_':   u'\xA0',
1129        u'L':   u'\u2028',
1130        u'P':   u'\u2029',
1131    }
1132
1133    ESCAPE_CODES = {
1134        u'x':   2,
1135        u'u':   4,
1136        u'U':   8,
1137    }
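    # For example (illustrative): in a double-quoted scalar, '\n' is replaced
    # with a line feed via ESCAPE_REPLACEMENTS, while '\x41' is read through
    # ESCAPE_CODES as a 2-digit hexadecimal escape and yields u'A'.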
1138
1139    def scan_flow_scalar_non_spaces(self, double, indent, start_marker):
1140        # See the specification for details.
1141        chunks = []
1142        while True:
1143            length = 0
1144            while self.reader.peek(length) not in u'\'\"\\\0 \t\r\n\x85\u2028\u2029':
1145                length += 1
1146            if length:
1147                chunks.append(self.reader.prefix(length))
1148                self.reader.forward(length)
1149            ch = self.reader.peek()
1150            if not double and ch == u'\'' and self.reader.peek(1) == u'\'':
1151                chunks.append(u'\'')
[46]1152                self.reader.forward(2)
[48]1153            elif (double and ch == u'\'') or (not double and ch in u'\"\\'):
1154                chunks.append(ch)
1155                self.reader.forward()
1156            elif double and ch == u'\\':
1157                self.reader.forward()
1158                ch = self.reader.peek()
1159                if ch in self.ESCAPE_REPLACEMENTS:
1160                    chunks.append(self.ESCAPE_REPLACEMENTS[ch])
1161                    self.reader.forward()
1162                elif ch in self.ESCAPE_CODES:
1163                    length = self.ESCAPE_CODES[ch]
1164                    self.reader.forward()
1165                    for k in range(length):
1166                        if self.reader.peek(k) not in u'0123456789ABCDEFabcdef':
1167                            raise ScannerError("while scanning a double-quoted scalar", start_marker,
1168                                    "expected escape sequence of %d hexadecimal digits, but found %r" %
1169                                        (length, self.reader.peek(k).encode('utf-8')), self.reader.get_marker())
1170                    code = int(self.reader.prefix(length), 16)
1171                    chunks.append(unichr(code))
1172                    self.reader.forward(length)
1173                elif ch in u'\r\n\x85\u2028\u2029':
1174                    self.scan_line_break()
1175                    chunks.extend(self.scan_flow_scalar_breaks(double, indent, start_marker))
1176                else:
1177                    raise ScannerError("while scanning a double-quoted scalar", start_marker,
1178                            "found unknown escape character %r" % ch.encode('utf-8'), self.reader.get_marker())
[37]1179            else:
[48]1180                return chunks
[37]1181
[48]1182    def scan_flow_scalar_spaces(self, double, indent, start_marker):
1183        # See the specification for details.
1184        chunks = []
1185        length = 0
1186        while self.reader.peek(length) in u' \t':
1187            length += 1
1188        whitespaces = self.reader.prefix(length)
1189        self.reader.forward(length)
1190        ch = self.reader.peek()
1191        if ch == u'\0':
1192            raise ScannerError("while scanning a quoted scalar", start_marker,
1193                    "found unexpected end of stream", self.reader.get_marker())
1194        elif ch in u'\r\n\x85\u2028\u2029':
1195            line_break = self.scan_line_break()
1196            breaks = self.scan_flow_scalar_breaks(double, indent, start_marker)
1197            if line_break != u'\n':
1198                chunks.append(line_break)
1199            elif not breaks:
1200                chunks.append(u' ')
1201            chunks.extend(breaks)
1202        else:
1203            chunks.append(whitespaces)
1204        return chunks
1205
1206    def scan_flow_scalar_breaks(self, double, indent, start_marker):
1207        # See the specification for details.
1208        chunks = []
1209        while True:
1210            while self.reader.column < indent and self.reader.peek() == u' ':
1211                self.reader.forward()
1212            if self.reader.column < indent  \
1213                    and self.reader.peek() not in u'\0\r\n\x85\u2028\u2029':
1214                s = 's'
1215                if indent == 1:
1216                    s = ''
1217                raise ScannerError("while scanning a quoted scalar", start_marker,
1218                        "expected %d space%s indentation, but found %r"
1219                        % (indent, s, self.reader.peek().encode('utf-8')),
1220                        self.reader.get_marker())
1221            while self.reader.peek() in u' \t':
1222                self.reader.forward()
1223            if self.reader.peek() in u'\r\n\x85\u2028\u2029':
1224                chunks.append(self.scan_line_break())
1225            else:
1226                return chunks
1227
[43]1228    def scan_plain(self):
[48]1229        # See the specification for details.
1230        # We add an additional restriction for the flow context:
1231        #   plain scalars in the flow context cannot contain ':' and '?'.
1232        # We also keep track of the `allow_simple_key` flag here.
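        # For example (illustrative): in the block context the plain scalar
        #   http://example.com/page
        # is scanned as a whole because each ':' is followed by a non-space
        # character, while inside a flow collection the scan would stop at
        # the first ':'.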
1233        chunks = []
1234        start_marker = self.reader.get_marker()
1235        end_marker = start_marker
[43]1236        indent = self.indent+1
[48]1237        if indent == 0:
[43]1238            indent = 1
[48]1239        spaces = []
[43]1240        while True:
[48]1241            length = 0
1242            if self.reader.peek() == u'#':
[43]1243                break
[48]1244            while True:
1245                ch = self.reader.peek(length)
1246                if ch in u'\0 \t\r\n\x85\u2028\u2029'   \
1247                        or (not self.flow_level and ch == u':' and
1248                                self.reader.peek(length+1) in u'\0 \t\r\n\x85\u2028\u2029') \
1249                        or (self.flow_level and ch in u',:?[]{}'):
1250                    break
1251                length += 1
1252            if length == 0:
[43]1253                break
[48]1254            self.allow_simple_key = False
1255            chunks.extend(spaces)
1256            chunks.append(self.reader.prefix(length))
1257            self.reader.forward(length)
1258            end_marker = self.reader.get_marker()
1259            spaces = self.scan_plain_spaces(indent)
1260            if not spaces or self.reader.peek() == u'#' \
1261                    or self.reader.column < indent:
1262                break
1263        return ScalarToken(u''.join(chunks), True, start_marker, end_marker)
[37]1264
[48]1265    def scan_plain_spaces(self, indent):
1266        # See the specification for details.
1267        # The specification is really confusing about tabs in plain scalars.
1268        # We just forbid them completely. Do not use tabs in YAML!
1269        chunks = []
1270        length = 0
1271        while self.reader.peek(length) in u' ':
1272            length += 1
1273        whitespaces = self.reader.prefix(length)
1274        self.reader.forward(length)
1275        ch = self.reader.peek()
1276        if ch in u'\r\n\x85\u2028\u2029':
1277            line_break = self.scan_line_break()
1278            self.allow_simple_key = True
1279            breaks = []
1280            while self.reader.peek() in u' \r\n\x85\u2028\u2029':
1281                if self.reader.peek() == ' ':
1282                    self.reader.forward()
1283                else:
1284                    breaks.append(self.scan_line_break())
1285            if line_break != u'\n':
1286                chunks.append(line_break)
1287            elif not breaks:
1288                chunks.append(u' ')
1289            chunks.extend(breaks)
1290        elif whitespaces:
1291            chunks.append(whitespaces)
1292        return chunks
1293
1294    def scan_tag_handle(self, name, start_marker):
1295        # See the specification for details.
1296        # For some strange reason, the specification does not allow '_' in
1297        # tag handles. I have allowed it anyway.
1298        if self.reader.peek() != u'!':
1299            raise ScannerError("while scanning a %s" % name, start_marker,
1300                    "expected '!', but found %r" % self.reader.peek().encode('utf-8'),
1301                    self.reader.get_marker())
1302        length = 1
1303        ch = self.reader.peek(length)
1304        if ch != u' ':
1305            while u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z'  \
1306                    or ch in u'-_':
1307                length += 1
1308                ch = self.reader.peek(length)
1309            if ch != u'!':
1310                self.reader.forward(length)
1311                raise ScannerError("while scanning a %s" % name, start_marker,
1312                        "expected '!', but found %r" % ch.encode('utf-8'),
1313                        self.reader.get_marker())
1314            length += 1
1315        value = self.reader.prefix(length)
1316        self.reader.forward(length)
1317        return value
1318
1319    def scan_tag_uri(self, name, start_marker):
1320        # See the specification for details.
1321        # Note: we do not check if URI is well-formed.
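        # For example (illustrative), the input
        #   tag:example.com,2006:foo%20bar
        # is returned as u'tag:example.com,2006:foo bar'; the '%20' escape
        # is decoded by scan_uri_escapes() below.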
1322        chunks = []
1323        length = 0
1324        ch = self.reader.peek(length)
1325        while u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z'  \
1326                or ch in u'-;/?:@&=+$,_.!~*\'()[]%':
1327            if ch == u'%':
1328                chunks.append(self.reader.prefix(length))
1329                self.reader.forward(length)
1330                length = 0
1331                chunks.append(self.scan_uri_escapes(name, start_marker))
1332            else:
1333                length += 1
1334            ch = self.reader.peek(length)
1335        if length:
1336            chunks.append(self.reader.prefix(length))
1337            self.reader.forward(length)
1338            length = 0
1339        if not chunks:
1340            raise ScannerError("while parsing a %s" % name, start_marker,
1341                    "expected URI, but found %r" % ch.encode('utf-8'),
1342                    self.reader.get_marker())
1343        return u''.join(chunks)
1344
1345    def scan_uri_escapes(self, name, start_marker):
1346        # See the specification for details.
1347        bytes = []
1348        marker = self.reader.get_marker()
1349        while self.reader.peek() == u'%':
1350            self.reader.forward()
1351            for k in range(2):
1352                if self.reader.peek(k) not in u'0123456789ABCDEFabcdef':
1353                    raise ScannerError("while scanning a %s" % name, start_marker,
1354                            "expected URI escape sequence of 2 hexadecimal digits, but found %r" %
1355                                (self.reader.peek(k).encode('utf-8')), self.reader.get_marker())
1356            bytes.append(chr(int(self.reader.prefix(2), 16)))
1357            self.reader.forward(2)
1358        try:
1359            value = unicode(''.join(bytes), 'utf-8')
1360        except UnicodeDecodeError, exc:
1361            raise ScannerError("while scanning a %s" % name, start_marker, str(exc), marker)
1362        return value
1363
[47]1364    def scan_line_break(self):
1365        # Transforms:
1366        #   '\r\n'      :   '\n'
1367        #   '\r'        :   '\n'
1368        #   '\n'        :   '\n'
1369        #   '\x85'      :   '\n'
1370        #   '\u2028'    :   '\u2028'
1371        #   '\u2029'    :   '\u2029'
1372        #   default     :   ''
1373        ch = self.reader.peek()
1374        if ch in u'\r\n\x85':
[48]1375            if self.reader.prefix(2) == u'\r\n':
[47]1376                self.forward(2)
1377            else:
1378                self.reader.forward()
1379            return u'\n'
1380        elif ch in u'\u2028\u2029':
1381            self.reader.forward()
1382            return ch
1383        return u''
1384
[45]1385#try:
1386#    import psyco
1387#    psyco.bind(Scanner)
1388#except ImportError:
1389#    pass
1390