source: branches/pyyaml3000/lib/yaml/scanner.py @ 60

Revision 60, 51.6 KB, checked in by xi, 9 years ago

Fixed a silly bug in scan_line_break.
Thanks Gustavo Rahal for reporting it.


# Scanner produces tokens of the following types:
# DIRECTIVE(name, value)
# DOCUMENT-START
# DOCUMENT-END
# STREAM-END
# BLOCK-SEQUENCE-START
# BLOCK-MAPPING-START
# BLOCK-END
# FLOW-SEQUENCE-START
# FLOW-MAPPING-START
# FLOW-SEQUENCE-END
# FLOW-MAPPING-END
# BLOCK-ENTRY
# FLOW-ENTRY
# KEY
# VALUE
# ALIAS(value)
# ANCHOR(value)
# TAG(value)
# SCALAR(value, plain)
#
# Read comments in the Scanner code for more details.
#
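# A minimal usage sketch (illustrative, not part of the original module; it
# assumes that the Reader class in reader.py accepts a unicode string and that
# the modules are importable as written below -- the actual package layout and
# the Reader constructor may differ):
#
#   from reader import Reader
#   from scanner import Scanner
#   from tokens import StreamEndToken
#
#   scanner = Scanner(Reader(u"- foo\n- bar\n"))
#   while not scanner.check(StreamEndToken):
#       print scanner.get()
#
# For this input the queue yields BLOCK-SEQUENCE-START, BLOCK-ENTRY,
# SCALAR('foo'), BLOCK-ENTRY, SCALAR('bar') and BLOCK-END, in that order.
#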

__all__ = ['Scanner', 'ScannerError']

from error import MarkedYAMLError
from tokens import *

class ScannerError(MarkedYAMLError):
    pass

class SimpleKey:
    # See the simple keys treatment below.

    def __init__(self, token_number, required, index, line, column, marker):
        self.token_number = token_number
        self.required = required
        self.index = index
        self.line = line
        self.column = column
        self.marker = marker

class Scanner:

    def __init__(self, reader):
        """Initialize the scanner."""
        # The input stream. The Reader class does the dirty work of checking
        # for BOM and converting the input data to Unicode. It also adds NUL
        # to the end.
        #
        # Reader supports the following methods:
        #   self.reader.peek(i=0)       # peek the next i-th character
        #   self.reader.prefix(l=1)     # peek the next l characters
        #   self.reader.forward(l=1)    # read the next l characters
        #                               # and move the pointer
        self.reader = reader

        # Have we reached the end of the stream?
        self.done = False

        # The number of unclosed '{' and '['. `flow_level == 0` means block
        # context.
        self.flow_level = 0

        # List of processed tokens that are not yet emitted.
        self.tokens = []

        # Number of tokens that were emitted through the `get` method.
        self.tokens_taken = 0

        # The current indentation level.
        self.indent = -1

        # Past indentation levels.
        self.indents = []

        # Variables related to simple keys treatment.

        # A simple key is a key that is not denoted by the '?' indicator.
        # Examples of simple keys:
        #   ---
        #   block simple key: value
        #   ? not a simple key:
        #   : { flow simple key: value }
        # We emit the KEY token before all keys, so when we find a potential
        # simple key, we try to locate the corresponding ':' indicator.
        # Simple keys should be limited to a single line and 1024 characters.

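        # An illustrative walk-through (not from the original comments): for
        # the single line
        #   block simple key: value
        # the position of the plain scalar is first saved as a possible
        # simple key; only when the ':' indicator is reached does
        # fetch_value() insert a KEY token *before* the already queued
        # SCALAR, so the emitted stream is
        #   BLOCK-MAPPING-START, KEY, SCALAR('block simple key'),
        #   VALUE, SCALAR('value'), BLOCK-END.
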
        # Can a simple key start at the current position? A simple key may
        # start:
        # - at the beginning of the line, not counting indentation spaces
        #       (in block context),
        # - after '{', '[', ',' (in the flow context),
        # - after '?', ':', '-' (in the block context).
        # In the block context, this flag also signifies if a block collection
        # may start at the current position.
        self.allow_simple_key = True

        # Keep track of possible simple keys. This is a dictionary. The key
        # is `flow_level`; there can be no more than one possible simple key
        # for each level. The value is a SimpleKey record:
        #   (token_number, required, index, line, column, marker)
        # A simple key may start with ALIAS, ANCHOR, TAG, SCALAR(flow),
        # '[', or '{' tokens.
        self.possible_simple_keys = {}

    # Public methods.

    def check(self, *choices):
        # Check if the next token is one of the given types.
        while self.need_more_tokens():
            self.fetch_more_tokens()
        if self.tokens:
            for choice in choices:
                if isinstance(self.tokens[0], choice):
                    return True
        return False

    def peek(self):
        # Return the next token, but do not delete it from the queue.
        while self.need_more_tokens():
            self.fetch_more_tokens()
        if self.tokens:
            return self.tokens[0]

    def get(self):
        # Return the next token and remove it from the queue.
        while self.need_more_tokens():
            self.fetch_more_tokens()
        if self.tokens:
            self.tokens_taken += 1
            return self.tokens.pop(0)

    def __iter__(self):
        # Iterator protocol.
        while self.need_more_tokens():
            self.fetch_more_tokens()
        while self.tokens:
            self.tokens_taken += 1
            yield self.tokens.pop(0)
            while self.need_more_tokens():
                self.fetch_more_tokens()

    # Private methods.

    def need_more_tokens(self):
        if self.done:
            return False
        if not self.tokens:
            return True
        # The current token may be a potential simple key, so we
        # need to look further.
        self.stale_possible_simple_keys()
        if self.next_possible_simple_key() == self.tokens_taken:
            return True

    def fetch_more_tokens(self):

        # Eat whitespaces and comments until we reach the next token.
        self.scan_to_next_token()

        # Remove obsolete possible simple keys.
        self.stale_possible_simple_keys()

        # Compare the current indentation and column. It may add some tokens
        # and decrease the current indentation level.
        self.unwind_indent(self.reader.column)

        # Peek the next character.
        ch = self.reader.peek()

        # Is it the end of stream?
        if ch == u'\0':
            return self.fetch_stream_end()

        # Is it a directive?
        if ch == u'%' and self.check_directive():
            return self.fetch_directive()

        # Is it the document start?
        if ch == u'-' and self.check_document_start():
            return self.fetch_document_start()

        # Is it the document end?
        if ch == u'.' and self.check_document_end():
            return self.fetch_document_end()

        # TODO: support for BOM within a stream.
        #if ch == u'\uFEFF':
        #    return self.fetch_bom()    <-- issue BOMToken

        # Note: the order of the following checks is NOT significant.

        # Is it the flow sequence start indicator?
        if ch == u'[':
            return self.fetch_flow_sequence_start()

        # Is it the flow mapping start indicator?
        if ch == u'{':
            return self.fetch_flow_mapping_start()

        # Is it the flow sequence end indicator?
        if ch == u']':
            return self.fetch_flow_sequence_end()

        # Is it the flow mapping end indicator?
        if ch == u'}':
            return self.fetch_flow_mapping_end()

        # Is it the flow entry indicator?
        if ch == u',':
            return self.fetch_flow_entry()

        # Is it the block entry indicator?
        if ch == u'-' and self.check_block_entry():
            return self.fetch_block_entry()

        # Is it the key indicator?
        if ch == u'?' and self.check_key():
            return self.fetch_key()

        # Is it the value indicator?
        if ch == u':' and self.check_value():
            return self.fetch_value()

        # Is it an alias?
        if ch == u'*':
            return self.fetch_alias()

        # Is it an anchor?
        if ch == u'&':
            return self.fetch_anchor()

        # Is it a tag?
        if ch == u'!':
            return self.fetch_tag()

        # Is it a literal scalar?
        if ch == u'|' and not self.flow_level:
            return self.fetch_literal()

        # Is it a folded scalar?
        if ch == u'>' and not self.flow_level:
            return self.fetch_folded()

        # Is it a single quoted scalar?
        if ch == u'\'':
            return self.fetch_single()

        # Is it a double quoted scalar?
        if ch == u'\"':
            return self.fetch_double()

        # It must be a plain scalar then.
        if self.check_plain():
            return self.fetch_plain()

        # No? It's an error. Let's produce a nice error message.
        raise ScannerError("while scanning for the next token", None,
                "found character %r that cannot start any token"
                % ch.encode('utf-8'), self.reader.get_marker())

    # Simple keys treatment.

    def next_possible_simple_key(self):
        # Return the number of the nearest possible simple key. Actually we
        # don't need to loop through the whole dictionary. We may replace it
        # with the following code:
        #   if not self.possible_simple_keys:
        #       return None
        #   return self.possible_simple_keys[
        #           min(self.possible_simple_keys.keys())].token_number
        min_token_number = None
        for level in self.possible_simple_keys:
            key = self.possible_simple_keys[level]
            if min_token_number is None or key.token_number < min_token_number:
                min_token_number = key.token_number
        return min_token_number

    def stale_possible_simple_keys(self):
        # Remove entries that are no longer possible simple keys. According to
        # the YAML specification, simple keys
        # - should be limited to a single line,
        # - should be no longer than 1024 characters.
        # Disabling this procedure will allow simple keys of any length and
        # height (may cause problems if indentation is broken though).
        for level in self.possible_simple_keys.keys():
            key = self.possible_simple_keys[level]
            if key.line != self.reader.line  \
                    or self.reader.index-key.index > 1024:
                if key.required:
                    raise ScannerError("while scanning a simple key", key.marker,
                            "could not find expected ':'", self.reader.get_marker())
                del self.possible_simple_keys[level]

    def save_possible_simple_key(self):
        # The next token may start a simple key. We check if it's possible
        # and save its position. This function is called for
        #   ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'.

        # Check if a simple key is required at the current position.
        required = not self.flow_level and self.indent == self.reader.column

        # A simple key is required only if it is the first token in the current
        # line. Therefore it is always allowed.
        assert self.allow_simple_key or not required

        # The next token might be a simple key. Let's save its number and
        # position.
        if self.allow_simple_key:
            self.remove_possible_simple_key()
            token_number = self.tokens_taken+len(self.tokens)
            index = self.reader.index
            line = self.reader.line
            column = self.reader.column
            marker = self.reader.get_marker()
            key = SimpleKey(token_number, required,
                    index, line, column, marker)
            self.possible_simple_keys[self.flow_level] = key

    def remove_possible_simple_key(self):
        # Remove the saved possible key position at the current flow level.
        if self.flow_level in self.possible_simple_keys:
            key = self.possible_simple_keys[self.flow_level]

            # I don't think it's possible, but I could be wrong.
            assert not key.required
            #if key.required:
            #    raise ScannerError("while scanning a simple key", key.marker,
            #            "could not find expected ':'", self.reader.get_marker())

            # Actually drop the saved key so a later ':' does not match it.
            del self.possible_simple_keys[self.flow_level]

    # Indentation functions.

    def unwind_indent(self, column):

        # In the flow context, tokens should respect indentation.
        # Actually the condition should be `self.indent >= column` according
        # to the spec. But this condition will prohibit intuitively correct
        # constructions such as
        #   key : {
        #   }
        if self.flow_level and self.indent > column:
            raise ScannerError(None, None,
                    "invalid indentation or unclosed '[' or '{'",
                    self.reader.get_marker())

        # In the block context, we may need to issue the BLOCK-END tokens.
        while self.indent > column:
            marker = self.reader.get_marker()
            self.indent = self.indents.pop()
            self.tokens.append(BlockEndToken(marker, marker))

    def add_indent(self, column):
        # Check if we need to increase indentation.
        if self.indent < column:
            self.indents.append(self.indent)
            self.indent = column
            return True
        return False

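    # An illustrative note (not from the original comments): for the document
    #   a:
    #     b: c
    # add_indent() pushes -1 and then 0 onto `indents` as the two block
    # mappings open, and unwind_indent(-1) at the end of the stream pops them
    # back, emitting one BLOCK-END token per popped level.
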
    # Fetchers.

    def fetch_stream_end(self):

        # Set the current indentation to -1.
        self.unwind_indent(-1)

        # Reset everything (not really needed).
        self.allow_simple_key = False
        self.possible_simple_keys = {}

        # Read the token.
        marker = self.reader.get_marker()

        # Add STREAM-END.
        self.tokens.append(StreamEndToken(marker, marker))

        # The stream is finished.
        self.done = True

    def fetch_directive(self):

        # Set the current indentation to -1.
        self.unwind_indent(-1)

        # Reset simple keys.
        self.remove_possible_simple_key()
        self.allow_simple_key = False

        # Scan and add DIRECTIVE.
        self.tokens.append(self.scan_directive())

    def fetch_document_start(self):
        self.fetch_document_indicator(DocumentStartToken)

    def fetch_document_end(self):
        self.fetch_document_indicator(DocumentEndToken)

    def fetch_document_indicator(self, TokenClass):

        # Set the current indentation to -1.
        self.unwind_indent(-1)

        # Reset simple keys. Note that there cannot be a block collection
        # after '---'.
        self.remove_possible_simple_key()
        self.allow_simple_key = False

        # Add DOCUMENT-START or DOCUMENT-END.
        start_marker = self.reader.get_marker()
        self.reader.forward(3)
        end_marker = self.reader.get_marker()
        self.tokens.append(TokenClass(start_marker, end_marker))

    def fetch_flow_sequence_start(self):
        self.fetch_flow_collection_start(FlowSequenceStartToken)

    def fetch_flow_mapping_start(self):
        self.fetch_flow_collection_start(FlowMappingStartToken)

    def fetch_flow_collection_start(self, TokenClass):

        # '[' and '{' may start a simple key.
        self.save_possible_simple_key()

        # Increase the flow level.
        self.flow_level += 1

        # Simple keys are allowed after '[' and '{'.
        self.allow_simple_key = True

        # Add FLOW-SEQUENCE-START or FLOW-MAPPING-START.
        start_marker = self.reader.get_marker()
        self.reader.forward()
        end_marker = self.reader.get_marker()
        self.tokens.append(TokenClass(start_marker, end_marker))

    def fetch_flow_sequence_end(self):
        self.fetch_flow_collection_end(FlowSequenceEndToken)

    def fetch_flow_mapping_end(self):
        self.fetch_flow_collection_end(FlowMappingEndToken)

    def fetch_flow_collection_end(self, TokenClass):

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Decrease the flow level.
        self.flow_level -= 1

        # No simple keys after ']' or '}'.
        self.allow_simple_key = False

        # Add FLOW-SEQUENCE-END or FLOW-MAPPING-END.
        start_marker = self.reader.get_marker()
        self.reader.forward()
        end_marker = self.reader.get_marker()
        self.tokens.append(TokenClass(start_marker, end_marker))

    def fetch_flow_entry(self):

        # Simple keys are allowed after ','.
        self.allow_simple_key = True

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Add FLOW-ENTRY.
        start_marker = self.reader.get_marker()
        self.reader.forward()
        end_marker = self.reader.get_marker()
        self.tokens.append(FlowEntryToken(start_marker, end_marker))

    def fetch_block_entry(self):

        # Block context needs additional checks.
        if not self.flow_level:

            # Are we allowed to start a new entry?
            if not self.allow_simple_key:
                raise ScannerError(None, None,
                        "sequence entries are not allowed here",
                        self.reader.get_marker())

            # We may need to add BLOCK-SEQUENCE-START.
            if self.add_indent(self.reader.column):
                marker = self.reader.get_marker()
                self.tokens.append(BlockSequenceStartToken(marker, marker))

        # It's an error for the block entry to occur in the flow context,
        # but we let the parser detect this.
        else:
            pass

        # Simple keys are allowed after '-'.
        self.allow_simple_key = True

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Add BLOCK-ENTRY.
        start_marker = self.reader.get_marker()
        self.reader.forward()
        end_marker = self.reader.get_marker()
        self.tokens.append(BlockEntryToken(start_marker, end_marker))

    def fetch_key(self):

        # Block context needs additional checks.
        if not self.flow_level:

            # Are we allowed to start a key (not necessarily a simple one)?
            if not self.allow_simple_key:
                raise ScannerError(None, None,
                        "mapping keys are not allowed here",
                        self.reader.get_marker())

            # We may need to add BLOCK-MAPPING-START.
            if self.add_indent(self.reader.column):
                marker = self.reader.get_marker()
                self.tokens.append(BlockMappingStartToken(marker, marker))

        # Simple keys are allowed after '?' in the block context.
        self.allow_simple_key = not self.flow_level

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Add KEY.
        start_marker = self.reader.get_marker()
        self.reader.forward()
        end_marker = self.reader.get_marker()
        self.tokens.append(KeyToken(start_marker, end_marker))

    def fetch_value(self):

        # Do we have a pending simple key?
        if self.flow_level in self.possible_simple_keys:

            # Add KEY.
            key = self.possible_simple_keys[self.flow_level]
            del self.possible_simple_keys[self.flow_level]
            self.tokens.insert(key.token_number-self.tokens_taken,
                    KeyToken(key.marker, key.marker))

            # If this key starts a new block mapping, we need to add
            # BLOCK-MAPPING-START.
            if not self.flow_level:
                if self.add_indent(key.column):
                    self.tokens.insert(key.token_number-self.tokens_taken,
                            BlockMappingStartToken(key.marker, key.marker))

            # There cannot be two simple keys one after another.
            self.allow_simple_key = False

        # It must be a part of a complex key.
        else:

            # Block context needs additional checks.
            # (Do we really need them? They will be caught by the parser
            # anyway.)
            if not self.flow_level:

                # We are allowed to start a complex value if and only if
                # we can start a simple key.
                if not self.allow_simple_key:
                    raise ScannerError(None, None,
                            "mapping values are not allowed here",
                            self.reader.get_marker())

            # Simple keys are allowed after ':' in the block context.
            self.allow_simple_key = not self.flow_level

            # Reset possible simple key on the current level.
            self.remove_possible_simple_key()

        # Add VALUE.
        start_marker = self.reader.get_marker()
        self.reader.forward()
        end_marker = self.reader.get_marker()
        self.tokens.append(ValueToken(start_marker, end_marker))

    def fetch_alias(self):

        # ALIAS could be a simple key.
        self.save_possible_simple_key()

        # No simple keys after ALIAS.
        self.allow_simple_key = False

        # Scan and add ALIAS.
        self.tokens.append(self.scan_anchor(AliasToken))

    def fetch_anchor(self):

        # ANCHOR could start a simple key.
        self.save_possible_simple_key()

        # No simple keys after ANCHOR.
        self.allow_simple_key = False

        # Scan and add ANCHOR.
        self.tokens.append(self.scan_anchor(AnchorToken))

    def fetch_tag(self):

        # TAG could start a simple key.
        self.save_possible_simple_key()

        # No simple keys after TAG.
        self.allow_simple_key = False

        # Scan and add TAG.
        self.tokens.append(self.scan_tag())

    def fetch_literal(self):
        self.fetch_block_scalar(folded=False)

    def fetch_folded(self):
        self.fetch_block_scalar(folded=True)

    def fetch_block_scalar(self, folded):

        # A simple key may follow a block scalar.
        self.allow_simple_key = True

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Scan and add SCALAR.
        self.tokens.append(self.scan_block_scalar(folded))

    def fetch_single(self):
        self.fetch_flow_scalar(double=False)

    def fetch_double(self):
        self.fetch_flow_scalar(double=True)

    def fetch_flow_scalar(self, double):

        # A flow scalar could be a simple key.
        self.save_possible_simple_key()

        # No simple keys after flow scalars.
        self.allow_simple_key = False

        # Scan and add SCALAR.
        self.tokens.append(self.scan_flow_scalar(double))

    def fetch_plain(self):

        # A plain scalar could be a simple key.
        self.save_possible_simple_key()

        # No simple keys after plain scalars. But note that `scan_plain` will
        # change this flag if the scan is finished at the beginning of the
        # line.
        self.allow_simple_key = False

        # Scan and add SCALAR. May change `allow_simple_key`.
        self.tokens.append(self.scan_plain())

    # Checkers.

    def check_directive(self):

        # DIRECTIVE:        ^ '%' ...
        # The '%' indicator is already checked.
        if self.reader.column == 0:
            return True

    def check_document_start(self):

        # DOCUMENT-START:   ^ '---' (' '|'\n')
        if self.reader.column == 0:
            if self.reader.prefix(3) == u'---'  \
                    and self.reader.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
                return True

    def check_document_end(self):

        # DOCUMENT-END:     ^ '...' (' '|'\n')
        if self.reader.column == 0:
            if self.reader.prefix(3) == u'...'  \
                    and self.reader.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
                return True

    def check_block_entry(self):

        # BLOCK-ENTRY:      '-' (' '|'\n')
        return self.reader.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'

    def check_key(self):

        # KEY(flow context):    '?'
        if self.flow_level:
            return True

        # KEY(block context):   '?' (' '|'\n')
        else:
            return self.reader.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'

    def check_value(self):

        # VALUE(flow context):  ':'
        if self.flow_level:
            return True

        # VALUE(block context): ':' (' '|'\n')
        else:
            return self.reader.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'

    def check_plain(self):

        # A plain scalar may start with any non-space character except:
        #   '-', '?', ':', ',', '[', ']', '{', '}',
        #   '#', '&', '*', '!', '|', '>', '\'', '\"',
        #   '%', '@', '`'.
        #
        # It may also start with
        #   '-', '?', ':'
        # if it is followed by a non-space character.
        #
        # Note that we limit the last rule to the block context (except the
        # '-' character) because we want the flow context to be space
        # independent.
        ch = self.reader.peek()
        return ch not in u'\0 \t\r\n\x85\u2028\u2029-?:,[]{}#&*!|>\'\"%@`'  \
                or (self.reader.peek(1) not in u'\0 \t\r\n\x85\u2028\u2029'
                        and (ch == u'-' or (not self.flow_level and ch in u'?:')))

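    # An illustrative note (not from the original comments): in the block
    # context check_plain() accepts '-foo' as the start of a plain scalar,
    # since the '-' is followed by a non-space character, whereas '- foo'
    # fails this check and is scanned as a BLOCK-ENTRY indicator instead.
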
    # Scanners.

    def scan_to_next_token(self):
        # We ignore spaces, line breaks and comments.
        # If we find a line break in the block context, we set the flag
        # `allow_simple_key` on.
        # The byte order mark is stripped if it's the first character in the
        # stream. We do not yet support BOM inside the stream as the
        # specification requires. Any such mark will be considered as a part
        # of the document.
        #
        # TODO: We need to make tab handling rules more sane. A good rule is:
        #   Tabs cannot precede tokens
        #   BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END,
        #   KEY(block), VALUE(block), BLOCK-ENTRY
        # So the checking code is
        #   if <TAB>:
        #       self.allow_simple_keys = False
        # We also need to add the check for `allow_simple_keys == True` to
        # `unwind_indent` before issuing BLOCK-END.
        # Scanners for block, flow, and plain scalars need to be modified.

        if self.reader.index == 0 and self.reader.peek() == u'\uFEFF':
            self.reader.forward()
        found = False
        while not found:
            while self.reader.peek() == u' ':
                self.reader.forward()
            if self.reader.peek() == u'#':
                while self.reader.peek() not in u'\0\r\n\x85\u2028\u2029':
                    self.reader.forward()
            if self.scan_line_break():
                if not self.flow_level:
                    self.allow_simple_key = True
            else:
                found = True

    def scan_directive(self):
        # See the specification for details.
        start_marker = self.reader.get_marker()
        self.reader.forward()
        name = self.scan_directive_name(start_marker)
        value = None
        if name == u'YAML':
            value = self.scan_yaml_directive_value(start_marker)
            end_marker = self.reader.get_marker()
        elif name == u'TAG':
            value = self.scan_tag_directive_value(start_marker)
            end_marker = self.reader.get_marker()
        else:
            end_marker = self.reader.get_marker()
            while self.reader.peek() not in u'\0\r\n\x85\u2028\u2029':
                self.reader.forward()
        self.scan_directive_ignored_line(start_marker)
        return DirectiveToken(name, value, start_marker, end_marker)

    def scan_directive_name(self, start_marker):
        # See the specification for details.
        length = 0
        ch = self.reader.peek(length)
        while u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z'  \
                or ch in u'-_':
            length += 1
            ch = self.reader.peek(length)
        if not length:
            raise ScannerError("while scanning a directive", start_marker,
                    "expected alphabetic or numeric character, but found %r"
                    % ch.encode('utf-8'), self.reader.get_marker())
        value = self.reader.prefix(length)
        self.reader.forward(length)
        ch = self.reader.peek()
        if ch not in u'\0 \r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a directive", start_marker,
                    "expected alphabetic or numeric character, but found %r"
                    % ch.encode('utf-8'), self.reader.get_marker())
        return value

    def scan_yaml_directive_value(self, start_marker):
        # See the specification for details.
        while self.reader.peek() == u' ':
            self.reader.forward()
        major = self.scan_yaml_directive_number(start_marker)
        if self.reader.peek() != '.':
            raise ScannerError("while scanning a directive", start_marker,
                    "expected a digit or '.', but found %r"
                    % self.reader.peek().encode('utf-8'),
                    self.reader.get_marker())
        self.reader.forward()
        minor = self.scan_yaml_directive_number(start_marker)
        if self.reader.peek() not in u'\0 \r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a directive", start_marker,
                    "expected a digit or ' ', but found %r"
                    % self.reader.peek().encode('utf-8'),
                    self.reader.get_marker())
        return (major, minor)

    def scan_yaml_directive_number(self, start_marker):
        # See the specification for details.
        ch = self.reader.peek()
        if not (u'0' <= ch <= '9'):
            raise ScannerError("while scanning a directive", start_marker,
                    "expected a digit, but found %r" % ch.encode('utf-8'),
                    self.reader.get_marker())
        length = 0
        while u'0' <= self.reader.peek(length) <= u'9':
            length += 1
        value = int(self.reader.prefix(length))
        self.reader.forward(length)
        return value

    def scan_tag_directive_value(self, start_marker):
        # See the specification for details.
        while self.reader.peek() == u' ':
            self.reader.forward()
        handle = self.scan_tag_directive_handle(start_marker)
        while self.reader.peek() == u' ':
            self.reader.forward()
        prefix = self.scan_tag_directive_prefix(start_marker)
        return (handle, prefix)

    def scan_tag_directive_handle(self, start_marker):
        # See the specification for details.
        value = self.scan_tag_handle('directive', start_marker)
        ch = self.reader.peek()
        if ch != u' ':
            raise ScannerError("while scanning a directive", start_marker,
                    "expected ' ', but found %r" % ch.encode('utf-8'),
                    self.reader.get_marker())
        return value

    def scan_tag_directive_prefix(self, start_marker):
        # See the specification for details.
        value = self.scan_tag_uri('directive', start_marker)
        ch = self.reader.peek()
        if ch not in u'\0 \r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a directive", start_marker,
                    "expected ' ', but found %r" % ch.encode('utf-8'),
                    self.reader.get_marker())
        return value

    def scan_directive_ignored_line(self, start_marker):
        # See the specification for details.
        while self.reader.peek() == u' ':
            self.reader.forward()
        if self.reader.peek() == u'#':
            while self.reader.peek() not in u'\0\r\n\x85\u2028\u2029':
                self.reader.forward()
        ch = self.reader.peek()
        if ch not in u'\0\r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a directive", start_marker,
                    "expected a comment or a line break, but found %r"
                        % ch.encode('utf-8'), self.reader.get_marker())
        self.scan_line_break()

    def scan_anchor(self, TokenClass):
        # The specification does not restrict characters for anchors and
        # aliases. This may lead to problems; for instance, the document:
        #   [ *alias, value ]
        # can be interpreted in two ways, as
        #   [ "value" ]
        # and
        #   [ *alias , "value" ]
        # Therefore we restrict aliases to numbers and ASCII letters.
        start_marker = self.reader.get_marker()
        indicator = self.reader.peek()
        if indicator == '*':
            name = 'alias'
        else:
            name = 'anchor'
        self.reader.forward()
        length = 0
        ch = self.reader.peek(length)
        while u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z'  \
                or ch in u'-_':
            length += 1
            ch = self.reader.peek(length)
        if not length:
            raise ScannerError("while scanning an %s" % name, start_marker,
                    "expected alphabetic or numeric character, but found %r"
                    % ch.encode('utf-8'), self.reader.get_marker())
        value = self.reader.prefix(length)
        self.reader.forward(length)
        ch = self.reader.peek()
        if ch not in u'\0 \t\r\n\x85\u2028\u2029?:,]}%@`':
            raise ScannerError("while scanning an %s" % name, start_marker,
                    "expected alphabetic or numeric character, but found %r"
                    % ch.encode('utf-8'), self.reader.get_marker())
        end_marker = self.reader.get_marker()
        return TokenClass(value, start_marker, end_marker)

    def scan_tag(self):
        # See the specification for details.
        start_marker = self.reader.get_marker()
        ch = self.reader.peek(1)
        if ch == u'<':
            handle = None
            self.reader.forward(2)
            suffix = self.scan_tag_uri('tag', start_marker)
            if self.reader.peek() != u'>':
                raise ScannerError("while parsing a tag", start_marker,
                        "expected '>', but found %r" % self.reader.peek().encode('utf-8'),
                        self.reader.get_marker())
            self.reader.forward()
        elif ch in u'\0 \t\r\n\x85\u2028\u2029':
            handle = None
            suffix = u'!'
            self.reader.forward()
        else:
            length = 1
            use_handle = False
            while ch not in u'\0 \r\n\x85\u2028\u2029':
                if ch == u'!':
                    use_handle = True
                    break
                length += 1
                ch = self.reader.peek(length)
            if use_handle:
                handle = self.scan_tag_handle('tag', start_marker)
            else:
                handle = u'!'
                self.reader.forward()
            suffix = self.scan_tag_uri('tag', start_marker)
        ch = self.reader.peek()
        if ch not in u'\0 \r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a tag", start_marker,
                    "expected ' ', but found %r" % ch.encode('utf-8'),
                    self.reader.get_marker())
        value = (handle, suffix)
        end_marker = self.reader.get_marker()
        return TagToken(value, start_marker, end_marker)

    def scan_block_scalar(self, folded):
        # See the specification for details.

        chunks = []
        start_marker = self.reader.get_marker()

        # Scan the header.
        self.reader.forward()
        chomping, increment = self.scan_block_scalar_indicators(start_marker)
        self.scan_block_scalar_ignored_line(start_marker)

        # Determine the indentation level and go to the first non-empty line.
        min_indent = self.indent+1
        if min_indent < 1:
            min_indent = 1
        if increment is None:
            breaks, max_indent, end_marker = self.scan_block_scalar_indentation()
            indent = max(min_indent, max_indent)
        else:
            indent = min_indent+increment-1
            breaks, end_marker = self.scan_block_scalar_breaks(indent)
        line_break = u''

        # Scan the inner part of the block scalar.
        while self.reader.column == indent and self.reader.peek() != u'\0':
            chunks.extend(breaks)
            leading_non_space = self.reader.peek() not in u' \t'
            length = 0
            while self.reader.peek(length) not in u'\0\r\n\x85\u2028\u2029':
                length += 1
            chunks.append(self.reader.prefix(length))
            self.reader.forward(length)
            line_break = self.scan_line_break()
            breaks, end_marker = self.scan_block_scalar_breaks(indent)
            if self.reader.column == indent and self.reader.peek() != u'\0':
                # Unfortunately, folding rules are ambiguous.
                #
                # This is the folding according to the specification:

                if folded and line_break == u'\n'   \
                        and leading_non_space and self.reader.peek() not in u' \t':
                    if not breaks:
                        chunks.append(u' ')
                else:
                    chunks.append(line_break)

                # This is Clark Evans's interpretation (also in the spec
                # examples):
                #
                #if folded and line_break == u'\n':
                #    if not breaks:
                #        if self.reader.peek() not in ' \t':
                #            chunks.append(u' ')
                #        else:
                #            chunks.append(line_break)
                #else:
                #    chunks.append(line_break)
            else:
                break

        # Chomp the tail.
        if chomping is not False:
            chunks.append(line_break)
        if chomping is True:
            chunks.extend(breaks)

        # We are done.
        return ScalarToken(u''.join(chunks), False, start_marker, end_marker)

    def scan_block_scalar_indicators(self, start_marker):
        # See the specification for details.
        chomping = None
        increment = None
        ch = self.reader.peek()
        if ch in u'+-':
            if ch == '+':
                chomping = True
            else:
                chomping = False
            self.reader.forward()
            ch = self.reader.peek()
            if ch in u'0123456789':
                increment = int(ch)
                if increment == 0:
                    raise ScannerError("while scanning a block scalar", start_marker,
                            "expected indentation indicator in the range 1-9, but found 0",
                            self.reader.get_marker())
                self.reader.forward()
        elif ch in u'0123456789':
            increment = int(ch)
            if increment == 0:
                raise ScannerError("while scanning a block scalar", start_marker,
                        "expected indentation indicator in the range 1-9, but found 0",
                        self.reader.get_marker())
            self.reader.forward()
            ch = self.reader.peek()
            if ch in u'+-':
                if ch == '+':
                    chomping = True
                else:
                    chomping = False
                self.reader.forward()
        ch = self.reader.peek()
        if ch not in u'\0 \r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a block scalar", start_marker,
                    "expected chomping or indentation indicators, but found %r"
                        % ch.encode('utf-8'), self.reader.get_marker())
        return chomping, increment

    def scan_block_scalar_ignored_line(self, start_marker):
        # See the specification for details.
        while self.reader.peek() == u' ':
            self.reader.forward()
        if self.reader.peek() == u'#':
            while self.reader.peek() not in u'\0\r\n\x85\u2028\u2029':
                self.reader.forward()
        ch = self.reader.peek()
        if ch not in u'\0\r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a block scalar", start_marker,
                    "expected a comment or a line break, but found %r"
                        % ch.encode('utf-8'), self.reader.get_marker())
        self.scan_line_break()

    def scan_block_scalar_indentation(self):
        # See the specification for details.
        chunks = []
        max_indent = 0
        end_marker = self.reader.get_marker()
        while self.reader.peek() in u' \r\n\x85\u2028\u2029':
            if self.reader.peek() != u' ':
                chunks.append(self.scan_line_break())
                end_marker = self.reader.get_marker()
            else:
                self.reader.forward()
                if self.reader.column > max_indent:
                    max_indent = self.reader.column
        return chunks, max_indent, end_marker

    def scan_block_scalar_breaks(self, indent):
        # See the specification for details.
        chunks = []
        end_marker = self.reader.get_marker()
        while self.reader.column < indent and self.reader.peek() == u' ':
            self.reader.forward()
        while self.reader.peek() in u'\r\n\x85\u2028\u2029':
            chunks.append(self.scan_line_break())
            end_marker = self.reader.get_marker()
            while self.reader.column < indent and self.reader.peek() == u' ':
                self.reader.forward()
        return chunks, end_marker

    def scan_flow_scalar(self, double):
        # See the specification for details.
        chunks = []
        start_marker = self.reader.get_marker()
        indent = self.indent+1
        if indent == 0:
            indent = 1
        quote = self.reader.peek()
        self.reader.forward()
        chunks.extend(self.scan_flow_scalar_non_spaces(double, indent, start_marker))
        while self.reader.peek() != quote:
            chunks.extend(self.scan_flow_scalar_spaces(double, indent, start_marker))
            chunks.extend(self.scan_flow_scalar_non_spaces(double, indent, start_marker))
        self.reader.forward()
        end_marker = self.reader.get_marker()
        return ScalarToken(u''.join(chunks), False, start_marker, end_marker)

    ESCAPE_REPLACEMENTS = {
        u'0':   u'\0',
        u'a':   u'\x07',
        u'b':   u'\x08',
        u't':   u'\x09',
        u'\t':  u'\x09',
        u'n':   u'\x0A',
        u'v':   u'\x0B',
        u'f':   u'\x0C',
        u'r':   u'\x0D',
        u'e':   u'\x1B',
        u' ':   u'\x20',
        u'\"':  u'\"',
        u'\\':  u'\\',
        u'N':   u'\x85',
        u'_':   u'\xA0',
        u'L':   u'\u2028',
        u'P':   u'\u2029',
    }

    ESCAPE_CODES = {
        u'x':   2,
        u'u':   4,
        u'U':   8,
    }
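
    # An illustrative note (not part of the original source): in a
    # double-quoted scalar the escapes "\n" and "\L" are decoded through
    # ESCAPE_REPLACEMENTS into u'\x0A' and u'\u2028', while "\x41" is decoded
    # through ESCAPE_CODES (2 hexadecimal digits) into u'A'; see
    # scan_flow_scalar_non_spaces() below.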

    def scan_flow_scalar_non_spaces(self, double, indent, start_marker):
        # See the specification for details.
        chunks = []
        while True:
            length = 0
            while self.reader.peek(length) not in u'\'\"\\\0 \t\r\n\x85\u2028\u2029':
                length += 1
            if length:
                chunks.append(self.reader.prefix(length))
                self.reader.forward(length)
            ch = self.reader.peek()
            if not double and ch == u'\'' and self.reader.peek(1) == u'\'':
                chunks.append(u'\'')
                self.reader.forward(2)
            elif (double and ch == u'\'') or (not double and ch in u'\"\\'):
                chunks.append(ch)
                self.reader.forward()
            elif double and ch == u'\\':
                self.reader.forward()
                ch = self.reader.peek()
                if ch in self.ESCAPE_REPLACEMENTS:
                    chunks.append(self.ESCAPE_REPLACEMENTS[ch])
                    self.reader.forward()
                elif ch in self.ESCAPE_CODES:
                    length = self.ESCAPE_CODES[ch]
                    self.reader.forward()
                    for k in range(length):
                        if self.reader.peek(k) not in u'0123456789ABCDEFabcdef':
                            raise ScannerError("while scanning a double-quoted scalar", start_marker,
                                    "expected escape sequence of %d hexadecimal numbers, but found %r" %
                                        (length, self.reader.peek(k).encode('utf-8')), self.reader.get_marker())
                    code = int(self.reader.prefix(length), 16)
                    chunks.append(unichr(code))
                    self.reader.forward(length)
                elif ch in u'\r\n\x85\u2028\u2029':
                    self.scan_line_break()
                    chunks.extend(self.scan_flow_scalar_breaks(double, indent, start_marker))
                else:
                    raise ScannerError("while scanning a double-quoted scalar", start_marker,
                            "found unknown escape character %r" % ch.encode('utf-8'), self.reader.get_marker())
            else:
                return chunks

    def scan_flow_scalar_spaces(self, double, indent, start_marker):
        # See the specification for details.
        chunks = []
        length = 0
        while self.reader.peek(length) in u' \t':
            length += 1
        whitespaces = self.reader.prefix(length)
        self.reader.forward(length)
        ch = self.reader.peek()
        if ch == u'\0':
            raise ScannerError("while scanning a quoted scalar", start_marker,
                    "found unexpected end of stream", self.reader.get_marker())
        elif ch in u'\r\n\x85\u2028\u2029':
            line_break = self.scan_line_break()
            breaks = self.scan_flow_scalar_breaks(double, indent, start_marker)
            if line_break != u'\n':
                chunks.append(line_break)
            elif not breaks:
                chunks.append(u' ')
            chunks.extend(breaks)
        else:
            chunks.append(whitespaces)
        return chunks

    def scan_flow_scalar_breaks(self, double, indent, start_marker):
        # See the specification for details.
        chunks = []
        while True:
            while self.reader.column < indent and self.reader.peek() == u' ':
                self.reader.forward()
            if self.reader.column < indent  \
                    and self.reader.peek() not in u'\0\r\n\x85\u2028\u2029':
                s = 's'
                if indent == 1:
                    s = ''
                raise ScannerError("while scanning a quoted scalar", start_marker,
                        "expected %d space%s indentation, but found %r"
                        % (indent, s, self.reader.peek().encode('utf-8')),
                        self.reader.get_marker())
            while self.reader.peek() in u' \t':
                self.reader.forward()
            if self.reader.peek() in u'\r\n\x85\u2028\u2029':
                chunks.append(self.scan_line_break())
            else:
                return chunks

    def scan_plain(self):
        # See the specification for details.
        # We add an additional restriction for the flow context:
        #   plain scalars in the flow context cannot contain ':' and '?'.
        # We also keep track of the `allow_simple_key` flag here.
        chunks = []
        start_marker = self.reader.get_marker()
        end_marker = start_marker
        indent = self.indent+1
        if indent == 0:
            indent = 1
        spaces = []
        while True:
            length = 0
            if self.reader.peek() == u'#':
                break
            while True:
                ch = self.reader.peek(length)
                if ch in u'\0 \t\r\n\x85\u2028\u2029'   \
                        or (not self.flow_level and ch == u':' and
                                self.reader.peek(length+1) in u'\0 \t\r\n\x85\u2028\u2029') \
                        or (self.flow_level and ch in u',:?[]{}'):
                    break
                length += 1
            if length == 0:
                break
            self.allow_simple_key = False
            chunks.extend(spaces)
            chunks.append(self.reader.prefix(length))
            self.reader.forward(length)
            end_marker = self.reader.get_marker()
            spaces = self.scan_plain_spaces(indent)
            if not spaces or self.reader.peek() == u'#' \
                    or self.reader.column < indent:
                break
        return ScalarToken(u''.join(chunks), True, start_marker, end_marker)

    def scan_plain_spaces(self, indent):
        # See the specification for details.
        # The specification is really confusing about tabs in plain scalars.
        # We just forbid them completely. Do not use tabs in YAML!
        chunks = []
        length = 0
        while self.reader.peek(length) == u' ':
            length += 1
        whitespaces = self.reader.prefix(length)
        self.reader.forward(length)
        ch = self.reader.peek()
        if ch in u'\r\n\x85\u2028\u2029':
            line_break = self.scan_line_break()
            self.allow_simple_key = True
            breaks = []
            while self.reader.peek() in u' \r\n\x85\u2028\u2029':
                if self.reader.peek() == u' ':
                    self.reader.forward()
                else:
                    breaks.append(self.scan_line_break())
            if line_break != u'\n':
                chunks.append(line_break)
            elif not breaks:
                chunks.append(u' ')
            chunks.extend(breaks)
        elif whitespaces:
            chunks.append(whitespaces)
        return chunks

    def scan_tag_handle(self, name, start_marker):
        # See the specification for details.
        # For some strange reason, the specification does not allow '_' in
        # tag handles. I have allowed it anyway.
        ch = self.reader.peek()
        if ch != u'!':
            raise ScannerError("while scanning a %s" % name, start_marker,
                    "expected '!', but found %r" % ch.encode('utf-8'),
                    self.reader.get_marker())
        length = 1
        ch = self.reader.peek(length)
        if ch != u' ':
            while u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z'  \
                    or ch in u'-_':
                length += 1
                ch = self.reader.peek(length)
            if ch != u'!':
                self.reader.forward(length)
                raise ScannerError("while scanning a %s" % name, start_marker,
                        "expected '!', but found %r" % ch.encode('utf-8'),
                        self.reader.get_marker())
            length += 1
        value = self.reader.prefix(length)
        self.reader.forward(length)
        return value

    def scan_tag_uri(self, name, start_marker):
        # See the specification for details.
        # Note: we do not check if the URI is well-formed.
        chunks = []
        length = 0
        ch = self.reader.peek(length)
        while u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z'  \
                or ch in u'-;/?:@&=+$,_.!~*\'()[]%':
            if ch == u'%':
                chunks.append(self.reader.prefix(length))
                self.reader.forward(length)
                length = 0
                chunks.append(self.scan_uri_escapes(name, start_marker))
            else:
                length += 1
            ch = self.reader.peek(length)
        if length:
            chunks.append(self.reader.prefix(length))
            self.reader.forward(length)
            length = 0
        if not chunks:
            raise ScannerError("while parsing a %s" % name, start_marker,
                    "expected URI, but found %r" % ch.encode('utf-8'),
                    self.reader.get_marker())
        return u''.join(chunks)

    def scan_uri_escapes(self, name, start_marker):
        # See the specification for details.
        bytes = []
        marker = self.reader.get_marker()
        while self.reader.peek() == u'%':
            self.reader.forward()
            for k in range(2):
                if self.reader.peek(k) not in u'0123456789ABCDEFabcdef':
                    raise ScannerError("while scanning a %s" % name, start_marker,
                            "expected URI escape sequence of 2 hexadecimal numbers, but found %r" %
                                (self.reader.peek(k).encode('utf-8')), self.reader.get_marker())
            bytes.append(chr(int(self.reader.prefix(2), 16)))
            self.reader.forward(2)
        try:
            value = unicode(''.join(bytes), 'utf-8')
        except UnicodeDecodeError, exc:
            raise ScannerError("while scanning a %s" % name, start_marker, str(exc), marker)
        return value

    def scan_line_break(self):
        # Transforms:
        #   '\r\n'      :   '\n'
        #   '\r'        :   '\n'
        #   '\n'        :   '\n'
        #   '\x85'      :   '\n'
        #   '\u2028'    :   '\u2028'
        #   '\u2029'    :   '\u2029'
        #   default     :   ''
        ch = self.reader.peek()
        if ch in u'\r\n\x85':
            if self.reader.prefix(2) == u'\r\n':
                self.reader.forward(2)
            else:
                self.reader.forward()
            return u'\n'
        elif ch in u'\u2028\u2029':
            self.reader.forward()
            return ch
        return u''

#try:
#    import psyco
#    psyco.bind(Scanner)
#except ImportError:
#    pass
