source: branches/pyyaml3000/lib/yaml/parser.py @ 52

Revision 52, 20.0 KB checked in by xi, 8 years ago (diff)

Cleanup error messages.

Line 
1
2# YAML can be parsed by an LL(1) parser!
3#
4# We use the following production rules:
5# stream            ::= implicit_document? explicit_document* STREAM-END
6# explicit_document ::= DIRECTIVE* DOCUMENT-START block_node? DOCUMENT-END?
7# implicit_document ::= block_node DOCUMENT-END?
8# block_node    ::= ALIAS | properties? block_content
9# flow_node     ::= ALIAS | properties? flow_content
10# properties    ::= TAG ANCHOR? | ANCHOR TAG?
11# block_content     ::= block_collection | flow_collection | SCALAR
12# flow_content      ::= flow_collection | SCALAR
13# block_collection  ::= block_sequence | block_mapping
14# block_sequence    ::= BLOCK-SEQUENCE-START (BLOCK-ENTRY block_node?)* BLOCK-END
15# block_mapping     ::= BLOCK-MAPPING_START ((KEY block_node_or_indentless_sequence?)? (VALUE block_node_or_indentless_sequence?)?)* BLOCK-END
16# block_node_or_indentless_sequence ::= ALIAS | properties? (block_content | indentless_block_sequence)
17# indentless_block_sequence         ::= (BLOCK-ENTRY block_node?)+
18# flow_collection   ::= flow_sequence | flow_mapping
19# flow_sequence     ::= FLOW-SEQUENCE-START (flow_sequence_entry FLOW-ENTRY)* flow_sequence_entry? FLOW-SEQUENCE-END
20# flow_mapping      ::= FLOW-MAPPING-START (flow_mapping_entry FLOW-ENTRY)* flow_mapping_entry? FLOW-MAPPING-END
21# flow_sequence_entry   ::= flow_node | KEY flow_node? (VALUE flow_node?)?
22# flow_mapping_entry    ::= flow_node | KEY flow_node? (VALUE flow_node?)?
23
24# TODO: support for BOM within a stream.
25# stream ::= (BOM? implicit_document)? (BOM? explicit_document)* STREAM-END
26
# Note that there is a slight deviation from the specification. We require a
# non-empty node content if ANCHOR or TAG is specified. This disallows such
# documents as
30#
31#   key:    !!str   # empty value
32#
33# This is done to prevent ambiguity in parsing tags and aliases:
34#
35#   {   !!perl/YAML::Parser:    value }
36#
37# What is it? Should it be interpreted as
38#   {   ? !<tag:yaml.org,2002:perl/YAML::Parser> '' : value }
39# or
40#   {   ? !<tag:yaml.org,2002:perl/YAML::Parser:> value : '' }
# Since we disallow empty node content, tags are always followed by spaces
# or line breaks.
43
44# FIRST sets:
45# stream: FIRST(block_node) + { DIRECTIVE DOCUMENT-START }
46# explicit_document: { DIRECTIVE DOCUMENT-START }
47# implicit_document: FIRST(block_node)
48# block_node: { ALIAS TAG ANCHOR SCALAR BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START }
49# flow_node: { ALIAS ANCHOR TAG SCALAR FLOW-SEQUENCE-START FLOW-MAPPING-START }
50# block_content: { BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START SCALAR }
51# flow_content: { FLOW-SEQUENCE-START FLOW-MAPPING-START SCALAR }
52# block_collection: { BLOCK-SEQUENCE-START BLOCK-MAPPING-START }
53# flow_collection: { FLOW-SEQUENCE-START FLOW-MAPPING-START }
54# block_sequence: { BLOCK-SEQUENCE-START }
55# block_mapping: { BLOCK-MAPPING-START }
56# block_node_or_indentless_sequence: { ALIAS ANCHOR TAG SCALAR BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START BLOCK-ENTRY }
# indentless_sequence: { BLOCK-ENTRY }
58# flow_collection: { FLOW-SEQUENCE-START FLOW-MAPPING-START }
59# flow_sequence: { FLOW-SEQUENCE-START }
60# flow_mapping: { FLOW-MAPPING-START }
61# flow_sequence_entry: { ALIAS ANCHOR TAG SCALAR FLOW-SEQUENCE-START FLOW-MAPPING-START KEY }
62# flow_mapping_entry: { ALIAS ANCHOR TAG SCALAR FLOW-SEQUENCE-START FLOW-MAPPING-START KEY }
63
64from error import MarkedYAMLError
65from tokens import *
66from events import *
67
class ParserError(MarkedYAMLError):
    """Raised when the token stream does not match the YAML grammar."""
70
class Parser:
    """Translate the scanner's token stream into a stream of parsing events.

    The grammar (see the production rules at the top of the file) is LL(1),
    so the parser is a straightforward recursive descent; since writing such
    a parser is a routine task, the methods carry few comments.  Each
    parse_* method is a Python generator that yields events.  If you rewrite
    the parser in another language, you may replace all 'yield'-s with event
    handler calls.
    """

    # Tag handles that are available even when a document declares no
    # %TAG directives of its own.
    DEFAULT_TAGS = {
        u'!':   u'!',
        u'!!':  u'tag:yaml.org,2002:',
    }

    def __init__(self, scanner):
        self.scanner = scanner
        self.current_event = None   # one-event lookahead buffer for check()/get()
        self.yaml_version = None    # (major, minor) pair from a %YAML directive
        self.tag_handles = {}       # handle -> prefix mapping from %TAG directives
        self.event_generator = self.parse_stream()

    def check(self, *choices):
        """Return True if the next event is an instance of one of 'choices'."""
        if self.current_event is None:
            try:
                # next() works on Python 2.6+ and Python 3 alike, unlike the
                # Python-2-only generator method .next().
                self.current_event = next(self.event_generator)
            except StopIteration:
                pass
        if self.current_event is not None:
            for choice in choices:
                if isinstance(self.current_event, choice):
                    return True
        return False

    def get(self):
        """Return the next event and consume it, or None at end of stream."""
        if self.current_event is None:
            try:
                self.current_event = next(self.event_generator)
            except StopIteration:
                pass
        value = self.current_event
        self.current_event = None
        return value

    def __iter__(self):
        # Iterator protocol: iterate over the remaining events.
        return self.event_generator

    def parse_stream(self):
        # stream ::= implicit_document? explicit_document* STREAM-END

        # Parse an implicit document: content that starts without
        # directives or a '---' marker.
        if not self.scanner.check(DirectiveToken, DocumentStartToken,
                StreamEndToken):
            self.tag_handles = self.DEFAULT_TAGS
            for event in self.parse_block_node():
                yield event

        # Parse explicit documents.
        while not self.scanner.check(StreamEndToken):
            self.process_directives()
            if not self.scanner.check(DocumentStartToken):
                raise ParserError(None, None,
                        "expected '<document start>', but found %r"
                        % self.scanner.peek().id,
                        self.scanner.peek().start_marker)
            token = self.scanner.get()
            if self.scanner.check(DirectiveToken,
                    DocumentStartToken, DocumentEndToken, StreamEndToken):
                # '---' followed by no content: the document body is an
                # empty scalar.
                yield self.process_empty_scalar(token.end_marker)
            else:
                for event in self.parse_block_node():
                    yield event
            # Skip any number of '...' markers after the document.
            while self.scanner.check(DocumentEndToken):
                self.scanner.get()

        # Parse the end of the stream.
        token = self.scanner.get()
        yield StreamEndEvent(token.start_marker, token.end_marker)

    def process_directives(self):
        """Consume the DIRECTIVE* tokens that precede a document.

        Records the YAML version and %TAG handles, then fills in the
        default tag handles that the document did not override.
        Raises ParserError on a duplicate %YAML directive, a major
        version other than 1, or a duplicate tag handle.
        """
        self.yaml_version = None
        self.tag_handles = {}
        while self.scanner.check(DirectiveToken):
            token = self.scanner.get()
            if token.name == u'YAML':
                if self.yaml_version is not None:
                    raise ParserError(None, None,
                            "found duplicate YAML directive", token.start_marker)
                major, minor = token.value
                if major != 1:
                    raise ParserError(None, None,
                            "found incompatible YAML document (version 1.* is required)",
                            token.start_marker)
                self.yaml_version = token.value
            elif token.name == u'TAG':
                handle, prefix = token.value
                if handle in self.tag_handles:
                    raise ParserError(None, None,
                            "duplicate tag handle %r" % handle.encode('utf-8'),
                            token.start_marker)
                self.tag_handles[handle] = prefix
        for key in self.DEFAULT_TAGS:
            if key not in self.tag_handles:
                self.tag_handles[key] = self.DEFAULT_TAGS[key]

    def parse_block_node(self):
        """Parse a node in block context."""
        return self.parse_node(block=True)

    def parse_flow_node(self):
        """Parse a node in flow context."""
        return self.parse_node()

    def parse_block_node_or_indentless_sequence(self):
        """Parse a block node, allowing an indentless '- ...' sequence."""
        return self.parse_node(block=True, indentless_sequence=True)

    def parse_node(self, block=False, indentless_sequence=False):
        # block_node    ::= ALIAS | properties? block_content
        # flow_node     ::= ALIAS | properties? flow_content
        # properties    ::= TAG ANCHOR? | ANCHOR TAG?
        # block_content     ::= block_collection | flow_collection | SCALAR
        # flow_content      ::= flow_collection | SCALAR
        # block_collection  ::= block_sequence | block_mapping
        # block_node_or_indentless_sequence ::= ALIAS | properties?
        #                                       (block_content | indentless_block_sequence)
        if self.scanner.check(AliasToken):
            token = self.scanner.get()
            yield AliasEvent(token.value, token.start_marker, token.end_marker)
        else:
            anchor = None
            tag = None
            start_marker = end_marker = tag_marker = None
            # properties ::= TAG ANCHOR? | ANCHOR TAG?
            if self.scanner.check(AnchorToken):
                token = self.scanner.get()
                start_marker = end_marker = token.start_marker
                anchor = token.value
                if self.scanner.check(TagToken):
                    token = self.scanner.get()
                    end_marker = tag_marker = token.start_marker
                    tag = token.value
            elif self.scanner.check(TagToken):
                token = self.scanner.get()
                start_marker = end_marker = tag_marker = token.start_marker
                tag = token.value
                if self.scanner.check(AnchorToken):
                    token = self.scanner.get()
                    end_marker = token.start_marker
                    anchor = token.value
            if tag is not None:
                # Resolve the (handle, suffix) pair against the declared
                # tag handle prefixes.
                handle, suffix = tag
                if handle is not None:
                    if handle not in self.tag_handles:
                        raise ParserError("while parsing a node", start_marker,
                                "found undefined tag handle %r" % handle.encode('utf-8'),
                                tag_marker)
                    tag = self.tag_handles[handle]+suffix
                else:
                    tag = suffix
            if tag is None:
                # Every node except a plain scalar gets the non-specific
                # tag '!' when no explicit tag is given.
                if not (self.scanner.check(ScalarToken) and
                        self.scanner.peek().plain):
                    tag = u'!'
            if start_marker is None:
                start_marker = self.scanner.peek().start_marker
            event = None
            collection_events = None
            if indentless_sequence and self.scanner.check(BlockEntryToken):
                end_marker = self.scanner.peek().end_marker
                event = SequenceEvent(anchor, tag, start_marker, end_marker)
                collection_events = self.parse_indentless_sequence()
            else:
                if self.scanner.check(ScalarToken):
                    token = self.scanner.get()
                    end_marker = token.end_marker
                    event = ScalarEvent(anchor, tag, token.value,
                            start_marker, end_marker)
                elif self.scanner.check(FlowSequenceStartToken):
                    end_marker = self.scanner.peek().end_marker
                    event = SequenceEvent(anchor, tag, start_marker, end_marker)
                    collection_events = self.parse_flow_sequence()
                elif self.scanner.check(FlowMappingStartToken):
                    end_marker = self.scanner.peek().end_marker
                    event = MappingEvent(anchor, tag, start_marker, end_marker)
                    collection_events = self.parse_flow_mapping()
                elif block and self.scanner.check(BlockSequenceStartToken):
                    end_marker = self.scanner.peek().start_marker
                    event = SequenceEvent(anchor, tag, start_marker, end_marker)
                    collection_events = self.parse_block_sequence()
                elif block and self.scanner.check(BlockMappingStartToken):
                    end_marker = self.scanner.peek().start_marker
                    event = MappingEvent(anchor, tag, start_marker, end_marker)
                    collection_events = self.parse_block_mapping()
                else:
                    if block:
                        node = 'block'
                    else:
                        node = 'flow'
                    token = self.scanner.peek()
                    raise ParserError("while scanning a %s node" % node, start_marker,
                            "expected the node content, but found %r" % token.id,
                            token.start_marker)
            # Yield the node event first, then the events of its content
            # (the matching CollectionEndEvent is produced by the
            # collection generator itself).
            yield event
            if collection_events is not None:
                for event in collection_events:
                    yield event

    def parse_block_sequence(self):
        # BLOCK-SEQUENCE-START (BLOCK-ENTRY block_node?)* BLOCK-END
        token = self.scanner.get()
        start_marker = token.start_marker
        while self.scanner.check(BlockEntryToken):
            token = self.scanner.get()
            if not self.scanner.check(BlockEntryToken, BlockEndToken):
                for event in self.parse_block_node():
                    yield event
            else:
                # '-' with no content: an empty scalar entry.
                yield self.process_empty_scalar(token.end_marker)
        if not self.scanner.check(BlockEndToken):
            token = self.scanner.peek()
            raise ParserError("while scanning a block collection", start_marker,
                    "expected <block end>, but found %r" % token.id, token.start_marker)
        token = self.scanner.get()
        yield CollectionEndEvent(token.start_marker, token.end_marker)

    def parse_indentless_sequence(self):
        # (BLOCK-ENTRY block_node?)+
        while self.scanner.check(BlockEntryToken):
            token = self.scanner.get()
            if not self.scanner.check(BlockEntryToken,
                    KeyToken, ValueToken, BlockEndToken):
                for event in self.parse_block_node():
                    yield event
            else:
                yield self.process_empty_scalar(token.end_marker)
        # An indentless sequence has no closing token; end it at the
        # position of the next token.
        token = self.scanner.peek()
        yield CollectionEndEvent(token.start_marker, token.start_marker)

    def parse_block_mapping(self):
        # BLOCK-MAPPING_START
        #   ((KEY block_node_or_indentless_sequence?)?
        #   (VALUE block_node_or_indentless_sequence?)?)*
        # BLOCK-END
        token = self.scanner.get()
        start_marker = token.start_marker
        while self.scanner.check(KeyToken, ValueToken):
            if self.scanner.check(KeyToken):
                token = self.scanner.get()
                if not self.scanner.check(KeyToken, ValueToken, BlockEndToken):
                    for event in self.parse_block_node_or_indentless_sequence():
                        yield event
                else:
                    yield self.process_empty_scalar(token.end_marker)
            if self.scanner.check(ValueToken):
                token = self.scanner.get()
                if not self.scanner.check(KeyToken, ValueToken, BlockEndToken):
                    for event in self.parse_block_node_or_indentless_sequence():
                        yield event
                else:
                    yield self.process_empty_scalar(token.end_marker)
            else:
                # A key with no ':' value gets an empty scalar value.
                token = self.scanner.peek()
                yield self.process_empty_scalar(token.start_marker)
        if not self.scanner.check(BlockEndToken):
            token = self.scanner.peek()
            raise ParserError("while scanning a block mapping", start_marker,
                    "expected <block end>, but found %r" % token.id, token.start_marker)
        token = self.scanner.get()
        yield CollectionEndEvent(token.start_marker, token.end_marker)

    def parse_flow_sequence(self):
        # flow_sequence     ::= FLOW-SEQUENCE-START
        #                       (flow_sequence_entry FLOW-ENTRY)*
        #                       flow_sequence_entry?
        #                       FLOW-SEQUENCE-END
        # flow_sequence_entry   ::= flow_node | KEY flow_node? (VALUE flow_node?)?
        #
        # Note that while production rules for both flow_sequence_entry and
        # flow_mapping_entry are equal, their interpretations are different.
        # For `flow_sequence_entry`, the part `KEY flow_node? (VALUE flow_node?)?`
        # generate an inline mapping (set syntax).
        token = self.scanner.get()
        start_marker = token.start_marker
        while not self.scanner.check(FlowSequenceEndToken):
            if self.scanner.check(KeyToken):
                # '? key: value' inside a flow sequence: a single-pair
                # inline mapping.
                token = self.scanner.get()
                yield MappingEvent(None, u'!',
                        token.start_marker, token.end_marker)
                if not self.scanner.check(ValueToken,
                        FlowEntryToken, FlowSequenceEndToken):
                    for event in self.parse_flow_node():
                        yield event
                else:
                    yield self.process_empty_scalar(token.end_marker)
                if self.scanner.check(ValueToken):
                    token = self.scanner.get()
                    if not self.scanner.check(FlowEntryToken, FlowSequenceEndToken):
                        for event in self.parse_flow_node():
                            yield event
                    else:
                        yield self.process_empty_scalar(token.end_marker)
                else:
                    token = self.scanner.peek()
                    yield self.process_empty_scalar(token.start_marker)
                token = self.scanner.peek()
                yield CollectionEndEvent(token.start_marker, token.start_marker)
            else:
                for event in self.parse_flow_node():
                    yield event
            if not self.scanner.check(FlowEntryToken, FlowSequenceEndToken):
                token = self.scanner.peek()
                raise ParserError("while scanning a flow sequence", start_marker,
                        "expected ',' or ']', but got %r" % token.id, token.start_marker)
            if self.scanner.check(FlowEntryToken):
                self.scanner.get()
        token = self.scanner.get()
        yield CollectionEndEvent(token.start_marker, token.end_marker)

    def parse_flow_mapping(self):
        # flow_mapping      ::= FLOW-MAPPING-START
        #                       (flow_mapping_entry FLOW-ENTRY)*
        #                       flow_mapping_entry?
        #                       FLOW-MAPPING-END
        # flow_mapping_entry    ::= flow_node | KEY flow_node? (VALUE flow_node?)?
        token = self.scanner.get()
        start_marker = token.start_marker
        while not self.scanner.check(FlowMappingEndToken):
            if self.scanner.check(KeyToken):
                token = self.scanner.get()
                if not self.scanner.check(ValueToken,
                        FlowEntryToken, FlowMappingEndToken):
                    for event in self.parse_flow_node():
                        yield event
                else:
                    yield self.process_empty_scalar(token.end_marker)
                if self.scanner.check(ValueToken):
                    token = self.scanner.get()
                    if not self.scanner.check(FlowEntryToken, FlowMappingEndToken):
                        for event in self.parse_flow_node():
                            yield event
                    else:
                        yield self.process_empty_scalar(token.end_marker)
                else:
                    token = self.scanner.peek()
                    yield self.process_empty_scalar(token.start_marker)
            else:
                # An entry without '?': the node is the key and the value
                # is an empty scalar.
                for event in self.parse_flow_node():
                    yield event
                yield self.process_empty_scalar(self.scanner.peek().start_marker)
            if not self.scanner.check(FlowEntryToken, FlowMappingEndToken):
                token = self.scanner.peek()
                raise ParserError("while scanning a flow mapping", start_marker,
                        "expected ',' or '}', but got %r" % token.id, token.start_marker)
            if self.scanner.check(FlowEntryToken):
                self.scanner.get()
        # Defensive check: the loop above can only exit when the next
        # token is FLOW-MAPPING-END, so this branch should be unreachable.
        if not self.scanner.check(FlowMappingEndToken):
            token = self.scanner.peek()
            raise ParserError("while scanning a flow mapping", start_marker,
                    "expected '}', but found %r" % token.id, token.start_marker)
        token = self.scanner.get()
        yield CollectionEndEvent(token.start_marker, token.end_marker)

    def process_empty_scalar(self, marker):
        """Return a ScalarEvent for an implicit empty scalar at 'marker'."""
        return ScalarEvent(None, None, u'', marker, marker)
432
Note: See TracBrowser for help on using the repository browser.