source: branches/pyyaml3000/lib/yaml/parser.py @ 55

Revision 55, 20.3 KB, checked in by xi

Working on Constructor.

# YAML can be parsed by an LL(1) parser!
#
# We use the following production rules:
# stream            ::= implicit_document? explicit_document* STREAM-END
# explicit_document ::= DIRECTIVE* DOCUMENT-START block_node? DOCUMENT-END?
# implicit_document ::= block_node DOCUMENT-END?
# block_node    ::= ALIAS | properties? block_content
# flow_node     ::= ALIAS | properties? flow_content
# properties    ::= TAG ANCHOR? | ANCHOR TAG?
# block_content     ::= block_collection | flow_collection | SCALAR
# flow_content      ::= flow_collection | SCALAR
# block_collection  ::= block_sequence | block_mapping
# block_sequence    ::= BLOCK-SEQUENCE-START (BLOCK-ENTRY block_node?)* BLOCK-END
# block_mapping     ::= BLOCK-MAPPING-START ((KEY block_node_or_indentless_sequence?)? (VALUE block_node_or_indentless_sequence?)?)* BLOCK-END
# block_node_or_indentless_sequence ::= ALIAS | properties? (block_content | indentless_block_sequence)
# indentless_block_sequence         ::= (BLOCK-ENTRY block_node?)+
# flow_collection   ::= flow_sequence | flow_mapping
# flow_sequence     ::= FLOW-SEQUENCE-START (flow_sequence_entry FLOW-ENTRY)* flow_sequence_entry? FLOW-SEQUENCE-END
# flow_mapping      ::= FLOW-MAPPING-START (flow_mapping_entry FLOW-ENTRY)* flow_mapping_entry? FLOW-MAPPING-END
# flow_sequence_entry   ::= flow_node | KEY flow_node? (VALUE flow_node?)?
# flow_mapping_entry    ::= flow_node | KEY flow_node? (VALUE flow_node?)?

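# As an illustration (not part of the original grammar notes; the token
# sequence is the scanner output we assume for this branch): the document
#
#   - foo
#   - bar
#
# is scanned as BLOCK-SEQUENCE-START BLOCK-ENTRY SCALAR BLOCK-ENTRY SCALAR
# BLOCK-END STREAM-END and is derived via
# stream -> implicit_document -> block_node -> block_content -> block_sequence.
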
# TODO: support for BOM within a stream.
# stream ::= (BOM? implicit_document)? (BOM? explicit_document)* STREAM-END

# Note that there is a slight deviation from the specification. We require
# non-empty node content if ANCHOR or TAG is specified. This disallows such
# documents as
#
#   key:    !!str   # empty value
#
# This is done to prevent ambiguity in parsing tags and aliases:
#
#   {   !!perl/YAML::Parser:    value }
#
# What is it? Should it be interpreted as
#   {   ? !<tag:yaml.org,2002:perl/YAML::Parser> '' : value }
# or
#   {   ? !<tag:yaml.org,2002:perl/YAML::Parser:> value : '' }
# Since we require non-empty node content, tags are always followed by spaces
# or line breaks.

# FIRST sets:
# stream: FIRST(block_node) + { DIRECTIVE DOCUMENT-START }
# explicit_document: { DIRECTIVE DOCUMENT-START }
# implicit_document: FIRST(block_node)
# block_node: { ALIAS TAG ANCHOR SCALAR BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START }
# flow_node: { ALIAS ANCHOR TAG SCALAR FLOW-SEQUENCE-START FLOW-MAPPING-START }
# block_content: { BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START SCALAR }
# flow_content: { FLOW-SEQUENCE-START FLOW-MAPPING-START SCALAR }
# block_collection: { BLOCK-SEQUENCE-START BLOCK-MAPPING-START }
# flow_collection: { FLOW-SEQUENCE-START FLOW-MAPPING-START }
# block_sequence: { BLOCK-SEQUENCE-START }
# block_mapping: { BLOCK-MAPPING-START }
# block_node_or_indentless_sequence: { ALIAS ANCHOR TAG SCALAR BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START BLOCK-ENTRY }
# indentless_sequence: { BLOCK-ENTRY }
# flow_collection: { FLOW-SEQUENCE-START FLOW-MAPPING-START }
# flow_sequence: { FLOW-SEQUENCE-START }
# flow_mapping: { FLOW-MAPPING-START }
# flow_sequence_entry: { ALIAS ANCHOR TAG SCALAR FLOW-SEQUENCE-START FLOW-MAPPING-START KEY }
# flow_mapping_entry: { ALIAS ANCHOR TAG SCALAR FLOW-SEQUENCE-START FLOW-MAPPING-START KEY }

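# For instance, parse_node() below chooses among the block_content
# alternatives by checking the next token against the corresponding FIRST
# sets: ScalarToken, FlowSequenceStartToken, FlowMappingStartToken,
# BlockSequenceStartToken or BlockMappingStartToken.
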
from error import MarkedYAMLError
from tokens import *
from events import *

class ParserError(MarkedYAMLError):
    pass

class Parser:
    # Since writing an LL(1) parser is a straightforward task, we do not give
    # many comments here.
    # Note that we use Python generators. If you rewrite the parser in another
    # language, you may replace all 'yield'-s with event handler calls.

    DEFAULT_TAGS = {
        u'!':   u'!',
        u'!!':  u'tag:yaml.org,2002:',
    }

    def __init__(self, scanner):
        self.scanner = scanner
        self.current_event = None
        self.yaml_version = None
        self.tag_handles = {}
        self.event_generator = self.parse_stream()

    def check(self, *choices):
        # Check the type of the next event.
        if self.current_event is None:
            try:
                self.current_event = self.event_generator.next()
            except StopIteration:
                pass
        if self.current_event is not None:
            for choice in choices:
                if isinstance(self.current_event, choice):
                    return True
        return False

    def peek(self):
        # Get the next event without removing it from the queue.
        if self.current_event is None:
            try:
                self.current_event = self.event_generator.next()
            except StopIteration:
                pass
        return self.current_event

    def get(self):
        # Get the next event and remove it from the queue.
        if self.current_event is None:
            try:
                self.current_event = self.event_generator.next()
            except StopIteration:
                pass
        value = self.current_event
        self.current_event = None
        return value

    def __iter__(self):
        # Iterator protocol.
        return self.event_generator

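    # A minimal usage sketch (illustration only; the Reader and Scanner
    # classes and their constructors are assumptions about the surrounding
    # package at this revision, not part of this module):
    #
    #   parser = Parser(Scanner(Reader(data)))
    #   for event in parser:
    #       ...handle ScalarEvent/SequenceEvent/MappingEvent/etc...
    #
    # or drive it incrementally with check()/peek()/get().
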
    def parse_stream(self):
        # implicit_document? explicit_document* STREAM-END
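        # For example, a stream like "--- foo\n--- bar\n" contains no
        # implicit document and two explicit documents; the loop below emits
        # one ScalarEvent per document and finally a StreamEndEvent.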

        # Parse implicit document.
        if not self.scanner.check(DirectiveToken, DocumentStartToken,
                StreamEndToken):
            self.tag_handles = self.DEFAULT_TAGS
            for event in self.parse_block_node():
                yield event

        # Parse explicit documents.
        while not self.scanner.check(StreamEndToken):
            self.process_directives()
            if not self.scanner.check(DocumentStartToken):
                raise ParserError(None, None,
                        "expected '<document start>', but found %r"
                        % self.scanner.peek().id,
                        self.scanner.peek().start_marker)
            token = self.scanner.get()
            if self.scanner.check(DirectiveToken,
                    DocumentStartToken, DocumentEndToken, StreamEndToken):
                yield self.process_empty_scalar(token.end_marker)
            else:
                for event in self.parse_block_node():
                    yield event
            while self.scanner.check(DocumentEndToken):
                self.scanner.get()

        # Parse end of stream.
        token = self.scanner.get()
        yield StreamEndEvent(token.start_marker, token.end_marker)

    def process_directives(self):
        # DIRECTIVE*
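        # A directive like "%TAG !e! tag:example.com,2000:app/" adds the
        # mapping u'!e!' -> u'tag:example.com,2000:app/' to tag_handles;
        # the default handles '!' and '!!' are filled in afterwards.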
        self.yaml_version = None
        self.tag_handles = {}
        while self.scanner.check(DirectiveToken):
            token = self.scanner.get()
            if token.name == u'YAML':
                if self.yaml_version is not None:
                    raise ParserError(None, None,
                            "found duplicate YAML directive", token.start_marker)
                major, minor = token.value
                if major != 1:
                    raise ParserError(None, None,
                            "found incompatible YAML document (version 1.* is required)",
                            token.start_marker)
                self.yaml_version = token.value
            elif token.name == u'TAG':
                handle, prefix = token.value
                if handle in self.tag_handles:
                    raise ParserError(None, None,
                            "duplicate tag handle %r" % handle.encode('utf-8'),
                            token.start_marker)
                self.tag_handles[handle] = prefix
        for key in self.DEFAULT_TAGS:
            if key not in self.tag_handles:
                self.tag_handles[key] = self.DEFAULT_TAGS[key]

    def parse_block_node(self):
        return self.parse_node(block=True)

    def parse_flow_node(self):
        return self.parse_node()

    def parse_block_node_or_indentless_sequence(self):
        return self.parse_node(block=True, indentless_sequence=True)

    def parse_node(self, block=False, indentless_sequence=False):
        # block_node    ::= ALIAS | properties? block_content
        # flow_node     ::= ALIAS | properties? flow_content
        # properties    ::= TAG ANCHOR? | ANCHOR TAG?
        # block_content     ::= block_collection | flow_collection | SCALAR
        # flow_content      ::= flow_collection | SCALAR
        # block_collection  ::= block_sequence | block_mapping
        # block_node_or_indentless_sequence ::= ALIAS | properties?
        #                                       (block_content | indentless_block_sequence)
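        # The method proceeds in three steps: read an optional ANCHOR/TAG
        # pair, resolve the tag handle against tag_handles, then dispatch on
        # the next token to emit a scalar, sequence or mapping event; for
        # collections the content events are produced by a nested generator.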
        if self.scanner.check(AliasToken):
            token = self.scanner.get()
            yield AliasEvent(token.value, token.start_marker, token.end_marker)
        else:
            anchor = None
            tag = None
            start_marker = end_marker = tag_marker = None
            if self.scanner.check(AnchorToken):
                token = self.scanner.get()
                start_marker = end_marker = token.start_marker
                anchor = token.value
                if self.scanner.check(TagToken):
                    token = self.scanner.get()
                    end_marker = tag_marker = token.start_marker
                    tag = token.value
            elif self.scanner.check(TagToken):
                token = self.scanner.get()
                start_marker = end_marker = tag_marker = token.start_marker
                tag = token.value
                if self.scanner.check(AnchorToken):
                    token = self.scanner.get()
                    end_marker = token.start_marker
                    anchor = token.value
            if tag is not None:
                handle, suffix = tag
                if handle is not None:
                    if handle not in self.tag_handles:
                        raise ParserError("while parsing a node", start_marker,
                                "found undefined tag handle %r" % handle.encode('utf-8'),
                                tag_marker)
                    tag = self.tag_handles[handle]+suffix
                else:
                    tag = suffix
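            # At this point `tag` is either None (no tag was given), a full
            # resolved tag such as u'tag:yaml.org,2002:str' for '!!str', or
            # the bare suffix when the handle was None (a verbatim '!<...>'
            # tag, assuming that is how the scanner reports it).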
            if tag is None:
                if not (self.scanner.check(ScalarToken) and
                        self.scanner.peek().plain):
                    tag = u'!'
            if start_marker is None:
                start_marker = self.scanner.peek().start_marker
            event = None
            collection_events = None
            if indentless_sequence and self.scanner.check(BlockEntryToken):
                end_marker = self.scanner.peek().end_marker
                event = SequenceEvent(anchor, tag, start_marker, end_marker)
                collection_events = self.parse_indentless_sequence()
            else:
                if self.scanner.check(ScalarToken):
                    token = self.scanner.get()
                    end_marker = token.end_marker
                    event = ScalarEvent(anchor, tag, token.value,
                            start_marker, end_marker)
                elif self.scanner.check(FlowSequenceStartToken):
                    end_marker = self.scanner.peek().end_marker
                    event = SequenceEvent(anchor, tag, start_marker, end_marker)
                    collection_events = self.parse_flow_sequence()
                elif self.scanner.check(FlowMappingStartToken):
                    end_marker = self.scanner.peek().end_marker
                    event = MappingEvent(anchor, tag, start_marker, end_marker)
                    collection_events = self.parse_flow_mapping()
                elif block and self.scanner.check(BlockSequenceStartToken):
                    end_marker = self.scanner.peek().start_marker
                    event = SequenceEvent(anchor, tag, start_marker, end_marker)
                    collection_events = self.parse_block_sequence()
                elif block and self.scanner.check(BlockMappingStartToken):
                    end_marker = self.scanner.peek().start_marker
                    event = MappingEvent(anchor, tag, start_marker, end_marker)
                    collection_events = self.parse_block_mapping()
                else:
                    if block:
                        node = 'block'
                    else:
                        node = 'flow'
                    token = self.scanner.peek()
                    raise ParserError("while scanning a %s node" % node, start_marker,
                            "expected the node content, but found %r" % token.id,
                            token.start_marker)
            yield event
            if collection_events is not None:
                for event in collection_events:
                    yield event

    def parse_block_sequence(self):
        # BLOCK-SEQUENCE-START (BLOCK-ENTRY block_node?)* BLOCK-END
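        # E.g. "- foo\n-\n- bar\n" yields a ScalarEvent for 'foo', an empty
        # ScalarEvent for the second (empty) entry, a ScalarEvent for 'bar',
        # and finally a CollectionEndEvent.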
        token = self.scanner.get()
        start_marker = token.start_marker
        while self.scanner.check(BlockEntryToken):
            token = self.scanner.get()
            if not self.scanner.check(BlockEntryToken, BlockEndToken):
                for event in self.parse_block_node():
                    yield event
            else:
                yield self.process_empty_scalar(token.end_marker)
        if not self.scanner.check(BlockEndToken):
            token = self.scanner.peek()
            raise ParserError("while scanning a block collection", start_marker,
                    "expected <block end>, but found %r" % token.id, token.start_marker)
        token = self.scanner.get()
        yield CollectionEndEvent(token.start_marker, token.end_marker)

    def parse_indentless_sequence(self):
        # (BLOCK-ENTRY block_node?)+
        while self.scanner.check(BlockEntryToken):
            token = self.scanner.get()
            if not self.scanner.check(BlockEntryToken,
                    KeyToken, ValueToken, BlockEndToken):
                for event in self.parse_block_node():
                    yield event
            else:
                yield self.process_empty_scalar(token.end_marker)
        token = self.scanner.peek()
        yield CollectionEndEvent(token.start_marker, token.start_marker)

    def parse_block_mapping(self):
        # BLOCK-MAPPING-START
        #   ((KEY block_node_or_indentless_sequence?)?
        #   (VALUE block_node_or_indentless_sequence?)?)*
        # BLOCK-END
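        # Missing keys or values are reported as empty scalars; e.g. "foo:\n"
        # yields a ScalarEvent for 'foo' followed by an empty ScalarEvent for
        # the value.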
        token = self.scanner.get()
        start_marker = token.start_marker
        while self.scanner.check(KeyToken, ValueToken):
            if self.scanner.check(KeyToken):
                token = self.scanner.get()
                if not self.scanner.check(KeyToken, ValueToken, BlockEndToken):
                    for event in self.parse_block_node_or_indentless_sequence():
                        yield event
                else:
                    yield self.process_empty_scalar(token.end_marker)
            if self.scanner.check(ValueToken):
                token = self.scanner.get()
                if not self.scanner.check(KeyToken, ValueToken, BlockEndToken):
                    for event in self.parse_block_node_or_indentless_sequence():
                        yield event
                else:
                    yield self.process_empty_scalar(token.end_marker)
            else:
                token = self.scanner.peek()
                yield self.process_empty_scalar(token.start_marker)
        if not self.scanner.check(BlockEndToken):
            token = self.scanner.peek()
            raise ParserError("while scanning a block mapping", start_marker,
                    "expected <block end>, but found %r" % token.id, token.start_marker)
        token = self.scanner.get()
        yield CollectionEndEvent(token.start_marker, token.end_marker)

    def parse_flow_sequence(self):
        # flow_sequence     ::= FLOW-SEQUENCE-START
        #                       (flow_sequence_entry FLOW-ENTRY)*
        #                       flow_sequence_entry?
        #                       FLOW-SEQUENCE-END
        # flow_sequence_entry   ::= flow_node | KEY flow_node? (VALUE flow_node?)?
        #
        # Note that although the production rules for flow_sequence_entry and
        # flow_mapping_entry are identical, their interpretations differ.
        # For `flow_sequence_entry`, the part `KEY flow_node? (VALUE flow_node?)?`
        # generates an inline mapping (set syntax).
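        # E.g. "[ foo: bar ]" produces a SequenceEvent whose single entry is
        # a one-pair MappingEvent for "foo: bar".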
        token = self.scanner.get()
        start_marker = token.start_marker
        while not self.scanner.check(FlowSequenceEndToken):
            if self.scanner.check(KeyToken):
                token = self.scanner.get()
                yield MappingEvent(None, u'!',
                        token.start_marker, token.end_marker)
                if not self.scanner.check(ValueToken,
                        FlowEntryToken, FlowSequenceEndToken):
                    for event in self.parse_flow_node():
                        yield event
                else:
                    yield self.process_empty_scalar(token.end_marker)
                if self.scanner.check(ValueToken):
                    token = self.scanner.get()
                    if not self.scanner.check(FlowEntryToken, FlowSequenceEndToken):
                        for event in self.parse_flow_node():
                            yield event
                    else:
                        yield self.process_empty_scalar(token.end_marker)
                else:
                    token = self.scanner.peek()
                    yield self.process_empty_scalar(token.start_marker)
                token = self.scanner.peek()
                yield CollectionEndEvent(token.start_marker, token.start_marker)
            else:
                for event in self.parse_flow_node():
                    yield event
            if not self.scanner.check(FlowEntryToken, FlowSequenceEndToken):
                token = self.scanner.peek()
                raise ParserError("while scanning a flow sequence", start_marker,
                        "expected ',' or ']', but got %r" % token.id, token.start_marker)
            if self.scanner.check(FlowEntryToken):
                self.scanner.get()
        token = self.scanner.get()
        yield CollectionEndEvent(token.start_marker, token.end_marker)

    def parse_flow_mapping(self):
        # flow_mapping      ::= FLOW-MAPPING-START
        #                       (flow_mapping_entry FLOW-ENTRY)*
        #                       flow_mapping_entry?
        #                       FLOW-MAPPING-END
        # flow_mapping_entry    ::= flow_node | KEY flow_node? (VALUE flow_node?)?
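        # E.g. "{ foo: bar, baz }" produces two key/value pairs; the entry
        # without ':' gets an empty scalar as its value.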
        token = self.scanner.get()
        start_marker = token.start_marker
        while not self.scanner.check(FlowMappingEndToken):
            if self.scanner.check(KeyToken):
                token = self.scanner.get()
                if not self.scanner.check(ValueToken,
                        FlowEntryToken, FlowMappingEndToken):
                    for event in self.parse_flow_node():
                        yield event
                else:
                    yield self.process_empty_scalar(token.end_marker)
                if self.scanner.check(ValueToken):
                    token = self.scanner.get()
                    if not self.scanner.check(FlowEntryToken, FlowMappingEndToken):
                        for event in self.parse_flow_node():
                            yield event
                    else:
                        yield self.process_empty_scalar(token.end_marker)
                else:
                    token = self.scanner.peek()
                    yield self.process_empty_scalar(token.start_marker)
            else:
                for event in self.parse_flow_node():
                    yield event
                yield self.process_empty_scalar(self.scanner.peek().start_marker)
            if not self.scanner.check(FlowEntryToken, FlowMappingEndToken):
                token = self.scanner.peek()
                raise ParserError("while scanning a flow mapping", start_marker,
                        "expected ',' or '}', but got %r" % token.id, token.start_marker)
            if self.scanner.check(FlowEntryToken):
                self.scanner.get()
        if not self.scanner.check(FlowMappingEndToken):
            token = self.scanner.peek()
            raise ParserError("while scanning a flow mapping", start_marker,
                    "expected '}', but found %r" % token.id, token.start_marker)
        token = self.scanner.get()
        yield CollectionEndEvent(token.start_marker, token.end_marker)

    def process_empty_scalar(self, marker):
        return ScalarEvent(None, None, u'', marker, marker)
