source: pyyaml/trunk/lib/yaml/parser.py @ 116

Revision 116, 20.1 KB checked in by xi, 9 years ago (diff)

Back to work :). Rename markers to marks.

RevLine 
[43]1
[51]2# YAML can be parsed by an LL(1) parser!
3#
4# We use the following production rules:
5# stream            ::= implicit_document? explicit_document* STREAM-END
[43]6# explicit_document ::= DIRECTIVE* DOCUMENT-START block_node? DOCUMENT-END?
7# implicit_document ::= block_node DOCUMENT-END?
8# block_node    ::= ALIAS | properties? block_content
9# flow_node     ::= ALIAS | properties? flow_content
10# properties    ::= TAG ANCHOR? | ANCHOR TAG?
11# block_content     ::= block_collection | flow_collection | SCALAR
12# flow_content      ::= flow_collection | SCALAR
13# block_collection  ::= block_sequence | block_mapping
[51]14# block_sequence    ::= BLOCK-SEQUENCE-START (BLOCK-ENTRY block_node?)* BLOCK-END
[43]15# block_mapping     ::= BLOCK-MAPPING_START ((KEY block_node_or_indentless_sequence?)? (VALUE block_node_or_indentless_sequence?)?)* BLOCK-END
16# block_node_or_indentless_sequence ::= ALIAS | properties? (block_content | indentless_block_sequence)
[51]17# indentless_block_sequence         ::= (BLOCK-ENTRY block_node?)+
[43]18# flow_collection   ::= flow_sequence | flow_mapping
[51]19# flow_sequence     ::= FLOW-SEQUENCE-START (flow_sequence_entry FLOW-ENTRY)* flow_sequence_entry? FLOW-SEQUENCE-END
20# flow_mapping      ::= FLOW-MAPPING-START (flow_mapping_entry FLOW-ENTRY)* flow_mapping_entry? FLOW-MAPPING-END
21# flow_sequence_entry   ::= flow_node | KEY flow_node? (VALUE flow_node?)?
22# flow_mapping_entry    ::= flow_node | KEY flow_node? (VALUE flow_node?)?
[52]23
24# TODO: support for BOM within a stream.
25# stream ::= (BOM? implicit_document)? (BOM? explicit_document)* STREAM-END
26
[51]27# Note that there is a slight deviation from the specification. We require a
28# non-empty node content if ANCHOR or TAG is specified. This disallows such
29# documents as
30#
31#   key:    !!str   # empty value
32#
33# This is done to prevent ambiguity in parsing tags and aliases:
34#
35#   {   !!perl/YAML::Parser:    value }
36#
37# What is it? Should it be interpreted as
38#   {   ? !<tag:yaml.org,2002:perl/YAML::Parser> '' : value }
39# or
40#   {   ? !<tag:yaml.org,2002:perl/YAML::Parser:> value : '' }
41# Since we disallow empty node content, tags are always followed by spaces
42# or line breaks.
[43]43
[51]44# FIRST sets:
45# stream: FIRST(block_node) + { DIRECTIVE DOCUMENT-START }
[43]46# explicit_document: { DIRECTIVE DOCUMENT-START }
[51]47# implicit_document: FIRST(block_node)
[43]48# block_node: { ALIAS TAG ANCHOR SCALAR BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START }
49# flow_node: { ALIAS ANCHOR TAG SCALAR FLOW-SEQUENCE-START FLOW-MAPPING-START }
50# block_content: { BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START SCALAR }
51# flow_content: { FLOW-SEQUENCE-START FLOW-MAPPING-START SCALAR }
52# block_collection: { BLOCK-SEQUENCE-START BLOCK-MAPPING-START }
53# flow_collection: { FLOW-SEQUENCE-START FLOW-MAPPING-START }
54# block_sequence: { BLOCK-SEQUENCE-START }
55# block_mapping: { BLOCK-MAPPING-START }
[51]56# block_node_or_indentless_sequence: { ALIAS ANCHOR TAG SCALAR BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START BLOCK-ENTRY }
57# indentless_sequence: { BLOCK-ENTRY }
58# flow_collection: { FLOW-SEQUENCE-START FLOW-MAPPING-START }
59# flow_sequence: { FLOW-SEQUENCE-START }
60# flow_mapping: { FLOW-MAPPING-START }
61# flow_sequence_entry: { ALIAS ANCHOR TAG SCALAR FLOW-SEQUENCE-START FLOW-MAPPING-START KEY }
62# flow_mapping_entry: { ALIAS ANCHOR TAG SCALAR FLOW-SEQUENCE-START FLOW-MAPPING-START KEY }
63
# Public names exported by `from parser import *`.
__all__ = ['Parser', 'ParserError']
65
[52]66from error import MarkedYAMLError
[46]67from tokens import *
[51]68from events import *
[44]69
class ParserError(MarkedYAMLError):
    # Raised when the token stream violates the YAML grammar.  The
    # (context, context_mark, problem, problem_mark) constructor arguments
    # and formatting come from MarkedYAMLError.
    pass
[44]72
class Parser:
    # An LL(1) parser that turns the scanner's token stream into a stream of
    # events (see the grammar and FIRST sets in the header comment above).
    #
    # Since writing an LL(1) parser is a straightforward task, we do not give
    # many comments here.
    # Note that we use Python generators. If you rewrite the parser in another
    # language, you may replace all 'yield'-s with event handler calls.

    # Tag handles that are always available.  '!' stays a local tag; '!!'
    # expands to the standard YAML tag prefix.  Merged into `tag_handles`
    # after each directive section (see process_directives).
    DEFAULT_TAGS = {
        u'!':   u'!',
        u'!!':  u'tag:yaml.org,2002:',
    }

    def __init__(self, scanner):
        # `scanner` must expose check()/peek()/get() over tokens; this class
        # mirrors the same three-method lookahead interface over events.
        self.scanner = scanner
        # One-event lookahead buffer shared by check()/peek()/get().
        self.current_event = None
        # Version pair from a %YAML directive; reset per explicit document.
        self.yaml_version = None
        # Active handle->prefix map built from %TAG directives.
        self.tag_handles = {}
        # The entire parse is one lazy generator; nothing is consumed from
        # the scanner until events are requested.
        self.event_generator = self.parse_stream()

    def check(self, *choices):
        # Check the type of the next event.
        # Returns True if the next event is an instance of any class in
        # `choices`; False at end of stream.
        if self.current_event is None:
            try:
                self.current_event = self.event_generator.next()
            except StopIteration:
                pass
        if self.current_event is not None:
            for choice in choices:
                if isinstance(self.current_event, choice):
                    return True
        return False

    def peek(self):
        # Get the next event without consuming it (None at end of stream).
        if self.current_event is None:
            try:
                self.current_event = self.event_generator.next()
            except StopIteration:
                pass
        return self.current_event

    def get(self):
        # Get the next event and consume it (None at end of stream).
        if self.current_event is None:
            try:
                self.current_event = self.event_generator.next()
            except StopIteration:
                pass
        value = self.current_event
        self.current_event = None
        return value

    def __iter__(self):
        # Iterator protocol.
        # NOTE(review): iterating bypasses the `current_event` buffer, so
        # mixing `for event in parser` with check()/peek()/get() could skip
        # a buffered event — use one style or the other.
        return self.event_generator

    def parse_stream(self):
        # implicit_document? explicit_document* STREAM-END

        # Parse implicit document.
        if not self.scanner.check(DirectiveToken, DocumentStartToken,
                StreamEndToken):
            # No directives are possible here, so only the defaults apply.
            self.tag_handles = self.DEFAULT_TAGS
            for event in self.parse_block_node():
                yield event

        # Parse explicit documents.
        while not self.scanner.check(StreamEndToken):
            self.process_directives()
            if not self.scanner.check(DocumentStartToken):
                raise ParserError(None, None,
                        "expected '<document start>', but found %r"
                        % self.scanner.peek().id,
                        self.scanner.peek().start_mark)
            token = self.scanner.get()
            if self.scanner.check(DirectiveToken,
                    DocumentStartToken, DocumentEndToken, StreamEndToken):
                # '---' immediately followed by another document boundary:
                # the document body is an empty scalar.
                yield self.process_empty_scalar(token.end_mark)
            else:
                for event in self.parse_block_node():
                    yield event
            # Consume any number of trailing '...' markers.
            while self.scanner.check(DocumentEndToken):
                self.scanner.get()

        # Parse end of stream.
        token = self.scanner.get()
        yield StreamEndEvent(token.start_mark, token.end_mark)

    def process_directives(self):
        # DIRECTIVE*
        # Resets and repopulates `yaml_version` and `tag_handles` from the
        # directives preceding a document, then fills in DEFAULT_TAGS for
        # any handle the document did not redefine.
        self.yaml_version = None
        self.tag_handles = {}
        while self.scanner.check(DirectiveToken):
            token = self.scanner.get()
            if token.name == u'YAML':
                if self.yaml_version is not None:
                    raise ParserError(None, None,
                            "found duplicate YAML directive", token.start_mark)
                major, minor = token.value
                if major != 1:
                    raise ParserError(None, None,
                            "found incompatible YAML document (version 1.* is required)",
                            token.start_mark)
                self.yaml_version = token.value
            elif token.name == u'TAG':
                handle, prefix = token.value
                if handle in self.tag_handles:
                    raise ParserError(None, None,
                            "duplicate tag handle %r" % handle.encode('utf-8'),
                            token.start_mark)
                self.tag_handles[handle] = prefix
        for key in self.DEFAULT_TAGS:
            if key not in self.tag_handles:
                self.tag_handles[key] = self.DEFAULT_TAGS[key]

    def parse_block_node(self):
        # Parse a node in block context.
        return self.parse_node(block=True)

    def parse_flow_node(self):
        # Parse a node in flow context.
        return self.parse_node()

    def parse_block_node_or_indentless_sequence(self):
        # Parse a block node, additionally allowing an indentless sequence
        # (used for block-mapping values).
        return self.parse_node(block=True, indentless_sequence=True)

    def parse_node(self, block=False, indentless_sequence=False):
        # block_node    ::= ALIAS | properties? block_content
        # flow_node     ::= ALIAS | properties? flow_content
        # properties    ::= TAG ANCHOR? | ANCHOR TAG?
        # block_content     ::= block_collection | flow_collection | SCALAR
        # flow_content      ::= flow_collection | SCALAR
        # block_collection  ::= block_sequence | block_mapping
        # block_node_or_indentless_sequence ::= ALIAS | properties?
        #                                       (block_content | indentless_block_sequence)
        if self.scanner.check(AliasToken):
            token = self.scanner.get()
            yield AliasEvent(token.value, token.start_mark, token.end_mark)
        else:
            anchor = None
            tag = None
            start_mark = end_mark = tag_mark = None
            # Properties may appear in either order: ANCHOR TAG? or TAG ANCHOR?.
            if self.scanner.check(AnchorToken):
                token = self.scanner.get()
                start_mark = end_mark = token.start_mark
                anchor = token.value
                if self.scanner.check(TagToken):
                    token = self.scanner.get()
                    end_mark = tag_mark = token.start_mark
                    tag = token.value
            elif self.scanner.check(TagToken):
                token = self.scanner.get()
                start_mark = end_mark = tag_mark = token.start_mark
                tag = token.value
                if self.scanner.check(AnchorToken):
                    token = self.scanner.get()
                    end_mark = token.start_mark
                    anchor = token.value
            if tag is not None:
                # A tag token is a (handle, suffix) pair; resolve the handle
                # against the %TAG directives (or pass the suffix through for
                # verbatim !<...> tags, where handle is None).
                handle, suffix = tag
                if handle is not None:
                    if handle not in self.tag_handles:
                        raise ParserError("while parsing a node", start_mark,
                                "found undefined tag handle %r" % handle.encode('utf-8'),
                                tag_mark)
                    tag = self.tag_handles[handle]+suffix
                else:
                    tag = suffix
            if tag is None:
                # Untagged nodes: plain scalars keep tag None (to be resolved
                # later); everything else gets the non-specific '!' tag.
                if not (self.scanner.check(ScalarToken) and
                        self.scanner.peek().plain):
                    tag = u'!'
            if start_mark is None:
                # No properties were present: the node starts at the content.
                start_mark = self.scanner.peek().start_mark
            event = None
            collection_events = None
            if indentless_sequence and self.scanner.check(BlockEntryToken):
                end_mark = self.scanner.peek().end_mark
                event = SequenceEvent(anchor, tag, start_mark, end_mark)
                collection_events = self.parse_indentless_sequence()
            else:
                if self.scanner.check(ScalarToken):
                    token = self.scanner.get()
                    end_mark = token.end_mark
                    event = ScalarEvent(anchor, tag, token.value,
                            start_mark, end_mark)
                elif self.scanner.check(FlowSequenceStartToken):
                    end_mark = self.scanner.peek().end_mark
                    event = SequenceEvent(anchor, tag, start_mark, end_mark)
                    collection_events = self.parse_flow_sequence()
                elif self.scanner.check(FlowMappingStartToken):
                    end_mark = self.scanner.peek().end_mark
                    event = MappingEvent(anchor, tag, start_mark, end_mark)
                    collection_events = self.parse_flow_mapping()
                elif block and self.scanner.check(BlockSequenceStartToken):
                    end_mark = self.scanner.peek().start_mark
                    event = SequenceEvent(anchor, tag, start_mark, end_mark)
                    collection_events = self.parse_block_sequence()
                elif block and self.scanner.check(BlockMappingStartToken):
                    end_mark = self.scanner.peek().start_mark
                    event = MappingEvent(anchor, tag, start_mark, end_mark)
                    collection_events = self.parse_block_mapping()
                else:
                    if block:
                        node = 'block'
                    else:
                        node = 'flow'
                    token = self.scanner.peek()
                    raise ParserError("while scanning a %s node" % node, start_mark,
                            "expected the node content, but found %r" % token.id,
                            token.start_mark)
            # The collection-start (or scalar/alias) event goes out first;
            # for collections, the sub-generator then produces the children
            # and the matching CollectionEndEvent.
            yield event
            if collection_events is not None:
                for event in collection_events:
                    yield event

    def parse_block_sequence(self):
        # BLOCK-SEQUENCE-START (BLOCK-ENTRY block_node?)* BLOCK-END
        token = self.scanner.get()
        start_mark = token.start_mark
        while self.scanner.check(BlockEntryToken):
            token = self.scanner.get()
            if not self.scanner.check(BlockEntryToken, BlockEndToken):
                for event in self.parse_block_node():
                    yield event
            else:
                # '-' with no content: the entry is an empty scalar.
                yield self.process_empty_scalar(token.end_mark)
        if not self.scanner.check(BlockEndToken):
            token = self.scanner.peek()
            raise ParserError("while scanning a block collection", start_mark,
                    "expected <block end>, but found %r" % token.id, token.start_mark)
        token = self.scanner.get()
        yield CollectionEndEvent(token.start_mark, token.end_mark)

    def parse_indentless_sequence(self):
        # (BLOCK-ENTRY block_node?)+
        # An indentless sequence has no BLOCK-END token; it ends at the first
        # token that cannot start another entry.
        while self.scanner.check(BlockEntryToken):
            token = self.scanner.get()
            if not self.scanner.check(BlockEntryToken,
                    KeyToken, ValueToken, BlockEndToken):
                for event in self.parse_block_node():
                    yield event
            else:
                yield self.process_empty_scalar(token.end_mark)
        # Zero-width end event at the position of the terminating token.
        token = self.scanner.peek()
        yield CollectionEndEvent(token.start_mark, token.start_mark)

    def parse_block_mapping(self):
        # BLOCK-MAPPING_START
        #   ((KEY block_node_or_indentless_sequence?)?
        #   (VALUE block_node_or_indentless_sequence?)?)*
        # BLOCK-END
        token = self.scanner.get()
        start_mark = token.start_mark
        while self.scanner.check(KeyToken, ValueToken):
            if self.scanner.check(KeyToken):
                token = self.scanner.get()
                if not self.scanner.check(KeyToken, ValueToken, BlockEndToken):
                    for event in self.parse_block_node_or_indentless_sequence():
                        yield event
                else:
                    # '?' with no key content: empty scalar key.
                    yield self.process_empty_scalar(token.end_mark)
            if self.scanner.check(ValueToken):
                token = self.scanner.get()
                if not self.scanner.check(KeyToken, ValueToken, BlockEndToken):
                    for event in self.parse_block_node_or_indentless_sequence():
                        yield event
                else:
                    # ':' with no value content: empty scalar value.
                    yield self.process_empty_scalar(token.end_mark)
            else:
                # Key without a ':' — supply an empty scalar value.
                token = self.scanner.peek()
                yield self.process_empty_scalar(token.start_mark)
        if not self.scanner.check(BlockEndToken):
            token = self.scanner.peek()
            raise ParserError("while scanning a block mapping", start_mark,
                    "expected <block end>, but found %r" % token.id, token.start_mark)
        token = self.scanner.get()
        yield CollectionEndEvent(token.start_mark, token.end_mark)

    def parse_flow_sequence(self):
        # flow_sequence     ::= FLOW-SEQUENCE-START
        #                       (flow_sequence_entry FLOW-ENTRY)*
        #                       flow_sequence_entry?
        #                       FLOW-SEQUENCE-END
        # flow_sequence_entry   ::= flow_node | KEY flow_node? (VALUE flow_node?)?
        #
        # Note that while production rules for both flow_sequence_entry and
        # flow_mapping_entry are equal, their interpretations are different.
        # For `flow_sequence_entry`, the part `KEY flow_node? (VALUE flow_node?)?`
        # generate an inline mapping (set syntax).
        token = self.scanner.get()
        start_mark = token.start_mark
        while not self.scanner.check(FlowSequenceEndToken):
            if self.scanner.check(KeyToken):
                # '? key: value' inside a flow sequence becomes a one-pair
                # mapping entry.
                token = self.scanner.get()
                yield MappingEvent(None, u'!',
                        token.start_mark, token.end_mark)
                if not self.scanner.check(ValueToken,
                        FlowEntryToken, FlowSequenceEndToken):
                    for event in self.parse_flow_node():
                        yield event
                else:
                    yield self.process_empty_scalar(token.end_mark)
                if self.scanner.check(ValueToken):
                    token = self.scanner.get()
                    if not self.scanner.check(FlowEntryToken, FlowSequenceEndToken):
                        for event in self.parse_flow_node():
                            yield event
                    else:
                        yield self.process_empty_scalar(token.end_mark)
                else:
                    token = self.scanner.peek()
                    yield self.process_empty_scalar(token.start_mark)
                # Close the inline single-pair mapping (zero-width mark).
                token = self.scanner.peek()
                yield CollectionEndEvent(token.start_mark, token.start_mark)
            else:
                for event in self.parse_flow_node():
                    yield event
            if not self.scanner.check(FlowEntryToken, FlowSequenceEndToken):
                token = self.scanner.peek()
                raise ParserError("while scanning a flow sequence", start_mark,
                        "expected ',' or ']', but got %r" % token.id, token.start_mark)
            if self.scanner.check(FlowEntryToken):
                self.scanner.get()
        token = self.scanner.get()
        yield CollectionEndEvent(token.start_mark, token.end_mark)

    def parse_flow_mapping(self):
        # flow_mapping      ::= FLOW-MAPPING-START
        #                       (flow_mapping_entry FLOW-ENTRY)*
        #                       flow_mapping_entry?
        #                       FLOW-MAPPING-END
        # flow_mapping_entry    ::= flow_node | KEY flow_node? (VALUE flow_node?)?
        token = self.scanner.get()
        start_mark = token.start_mark
        while not self.scanner.check(FlowMappingEndToken):
            if self.scanner.check(KeyToken):
                token = self.scanner.get()
                if not self.scanner.check(ValueToken,
                        FlowEntryToken, FlowMappingEndToken):
                    for event in self.parse_flow_node():
                        yield event
                else:
                    yield self.process_empty_scalar(token.end_mark)
                if self.scanner.check(ValueToken):
                    token = self.scanner.get()
                    if not self.scanner.check(FlowEntryToken, FlowMappingEndToken):
                        for event in self.parse_flow_node():
                            yield event
                    else:
                        yield self.process_empty_scalar(token.end_mark)
                else:
                    token = self.scanner.peek()
                    yield self.process_empty_scalar(token.start_mark)
            else:
                # Entry without '?': the node is the key and its value is an
                # empty scalar.
                for event in self.parse_flow_node():
                    yield event
                yield self.process_empty_scalar(self.scanner.peek().start_mark)
            if not self.scanner.check(FlowEntryToken, FlowMappingEndToken):
                token = self.scanner.peek()
                raise ParserError("while scanning a flow mapping", start_mark,
                        "expected ',' or '}', but got %r" % token.id, token.start_mark)
            if self.scanner.check(FlowEntryToken):
                self.scanner.get()
        if not self.scanner.check(FlowMappingEndToken):
            token = self.scanner.peek()
            raise ParserError("while scanning a flow mapping", start_mark,
                    "expected '}', but found %r" % token.id, token.start_mark)
        token = self.scanner.get()
        yield CollectionEndEvent(token.start_mark, token.end_mark)

    def process_empty_scalar(self, mark):
        # Build a zero-width, untagged empty-string scalar event at `mark`.
        return ScalarEvent(None, None, u'', mark, mark)
[43]443
Note: See TracBrowser for help on using the repository browser.