source: pyyaml/branches/working-on-emitter/lib/yaml/parser.py @ 127

Revision 127, 22.2 KB checked in by xi, 9 years ago (diff)

Parser now provides style information. Allow empty plain scalars if a tag or anchor is given.

Line 
1
2# YAML can be parsed by an LL(1) parser!
3#
4# We use the following production rules:
5# stream            ::= STREAM-START implicit_document? explicit_document* STREAM-END
6# explicit_document ::= DIRECTIVE* DOCUMENT-START block_node? DOCUMENT-END?
7# implicit_document ::= block_node DOCUMENT-END?
8# block_node    ::= ALIAS | properties? block_content
9# flow_node     ::= ALIAS | properties? flow_content
10# properties    ::= TAG ANCHOR? | ANCHOR TAG?
11# block_content     ::= block_collection | flow_collection | SCALAR
12# flow_content      ::= flow_collection | SCALAR
13# block_collection  ::= block_sequence | block_mapping
14# block_sequence    ::= BLOCK-SEQUENCE-START (BLOCK-ENTRY block_node?)* BLOCK-END
15# block_mapping     ::= BLOCK-MAPPING_START ((KEY block_node_or_indentless_sequence?)? (VALUE block_node_or_indentless_sequence?)?)* BLOCK-END
16# block_node_or_indentless_sequence ::= ALIAS | properties? (block_content | indentless_block_sequence)
17# indentless_block_sequence         ::= (BLOCK-ENTRY block_node?)+
18# flow_collection   ::= flow_sequence | flow_mapping
19# flow_sequence     ::= FLOW-SEQUENCE-START (flow_sequence_entry FLOW-ENTRY)* flow_sequence_entry? FLOW-SEQUENCE-END
20# flow_mapping      ::= FLOW-MAPPING-START (flow_mapping_entry FLOW-ENTRY)* flow_mapping_entry? FLOW-MAPPING-END
21# flow_sequence_entry   ::= flow_node | KEY flow_node? (VALUE flow_node?)?
22# flow_mapping_entry    ::= flow_node | KEY flow_node? (VALUE flow_node?)?
23
24# TODO: support for BOM within a stream.
25# stream ::= (BOM? implicit_document)? (BOM? explicit_document)* STREAM-END
26
# Note that there is a slight deviation from the specification. We require a
# non-empty node content if ANCHOR or TAG is specified. This disallows
# documents such as
30#
31#   key:    !!str   # empty value
32#
33# This is done to prevent ambiguity in parsing tags and aliases:
34#
35#   {   !!perl/YAML::Parser:    value }
36#
37# What is it? Should it be interpreted as
38#   {   ? !<tag:yaml.org,2002:perl/YAML::Parser> '' : value }
39# or
40#   {   ? !<tag:yaml.org,2002:perl/YAML::Parser:> value : '' }
# Since we disallow empty node content, tags are always followed by spaces
# or line breaks.
43
44# FIRST sets:
45# stream: { STREAM-START }
46# explicit_document: { DIRECTIVE DOCUMENT-START }
47# implicit_document: FIRST(block_node)
48# block_node: { ALIAS TAG ANCHOR SCALAR BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START }
49# flow_node: { ALIAS ANCHOR TAG SCALAR FLOW-SEQUENCE-START FLOW-MAPPING-START }
50# block_content: { BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START SCALAR }
51# flow_content: { FLOW-SEQUENCE-START FLOW-MAPPING-START SCALAR }
52# block_collection: { BLOCK-SEQUENCE-START BLOCK-MAPPING-START }
53# flow_collection: { FLOW-SEQUENCE-START FLOW-MAPPING-START }
54# block_sequence: { BLOCK-SEQUENCE-START }
55# block_mapping: { BLOCK-MAPPING-START }
56# block_node_or_indentless_sequence: { ALIAS ANCHOR TAG SCALAR BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START BLOCK-ENTRY }
57# indentless_sequence: { ENTRY }
58# flow_collection: { FLOW-SEQUENCE-START FLOW-MAPPING-START }
59# flow_sequence: { FLOW-SEQUENCE-START }
60# flow_mapping: { FLOW-MAPPING-START }
61# flow_sequence_entry: { ALIAS ANCHOR TAG SCALAR FLOW-SEQUENCE-START FLOW-MAPPING-START KEY }
62# flow_mapping_entry: { ALIAS ANCHOR TAG SCALAR FLOW-SEQUENCE-START FLOW-MAPPING-START KEY }
63
64__all__ = ['Parser', 'ParserError']
65
66from error import MarkedYAMLError
67from tokens import *
68from events import *
69
class ParserError(MarkedYAMLError):
    """Raised when the token stream does not match the YAML grammar."""
72
class Parser:
    # Since writing an LL(1) parser is a straightforward task, we do not give
    # many comments here.
    # Note that we use Python generators. If you rewrite the parser in another
    # language, you may replace all 'yield'-s with event handler calls.

    # Default tag-handle resolutions, used when a document declares no
    # %TAG directives of its own.
    DEFAULT_TAGS = {
        u'!':   u'!',
        u'!!':  u'tag:yaml.org,2002:',
    }

    def __init__(self, scanner):
        # The parser pulls tokens from the scanner and produces events
        # lazily through self.event_generator.
        self.scanner = scanner
        self.current_event = None
        self.yaml_version = None
        self.tag_handles = {}
        self.event_generator = self.parse_stream()

    def _fetch_current_event(self):
        # Buffer the next event from the generator in self.current_event
        # unless one is already buffered.  Factored out of check(), peek()
        # and get(), which previously duplicated this logic verbatim.
        if self.current_event is None:
            try:
                self.current_event = self.event_generator.next()
            except StopIteration:
                pass

    def check(self, *choices):
        # Check whether the next event is an instance of one of the given
        # event classes.  Does not consume the event.
        self._fetch_current_event()
        if self.current_event is not None:
            for choice in choices:
                if isinstance(self.current_event, choice):
                    return True
        return False

    def peek(self):
        # Return the next event without consuming it (None at end of stream).
        self._fetch_current_event()
        return self.current_event

    def get(self):
        # Return the next event and consume it (None at end of stream).
        self._fetch_current_event()
        value = self.current_event
        self.current_event = None
        return value

    def __iter__(self):
        # Iterator protocol: iterate over the remaining events.
        return self.event_generator

    def parse_stream(self):
        # stream ::= STREAM-START implicit_document? explicit_document* STREAM-END

        # Parse start of stream.
        token = self.scanner.get()
        yield StreamStartEvent(token.start_mark, token.end_mark,
                encoding=token.encoding)

        # Parse an implicit document: content that begins without '---'
        # and without directives.
        if not self.scanner.check(DirectiveToken, DocumentStartToken,
                StreamEndToken):
            self.tag_handles = self.DEFAULT_TAGS
            token = self.scanner.peek()
            start_mark = end_mark = token.start_mark
            yield DocumentStartEvent(start_mark, end_mark, implicit=True)
            for event in self.parse_block_node():
                yield event
            token = self.scanner.peek()
            start_mark = end_mark = token.start_mark
            # Consume any trailing '...' markers.
            while self.scanner.check(DocumentEndToken):
                token = self.scanner.get()
                end_mark = token.end_mark
            yield DocumentEndEvent(start_mark, end_mark)

        # Parse explicit documents (each introduced by optional directives
        # and a '---' marker).
        while not self.scanner.check(StreamEndToken):
            token = self.scanner.peek()
            start_mark = token.start_mark
            version, tags = self.process_directives()
            if not self.scanner.check(DocumentStartToken):
                raise ParserError(None, None,
                        "expected '<document start>', but found %r"
                        % self.scanner.peek().id,
                        self.scanner.peek().start_mark)
            token = self.scanner.get()
            end_mark = token.end_mark
            yield DocumentStartEvent(start_mark, end_mark,
                    implicit=False, version=version, tags=tags)
            if self.scanner.check(DirectiveToken,
                    DocumentStartToken, DocumentEndToken, StreamEndToken):
                # An explicit document with no content at all.
                yield self.process_empty_scalar(token.end_mark)
            else:
                for event in self.parse_block_node():
                    yield event
            token = self.scanner.peek()
            start_mark = end_mark = token.start_mark
            while self.scanner.check(DocumentEndToken):
                token = self.scanner.get()
                end_mark = token.end_mark
            yield DocumentEndEvent(start_mark, end_mark)

        # Parse end of stream.
        token = self.scanner.get()
        yield StreamEndEvent(token.start_mark, token.end_mark)

    def process_directives(self):
        # DIRECTIVE*
        # Collect %YAML and %TAG directives preceding a document.
        # Returns (version, tags); each is None when the corresponding
        # directive kind is absent.  Also (re)initializes self.tag_handles,
        # filling in the default handles afterwards.
        self.yaml_version = None
        self.tag_handles = {}
        while self.scanner.check(DirectiveToken):
            token = self.scanner.get()
            if token.name == u'YAML':
                if self.yaml_version is not None:
                    raise ParserError(None, None,
                            "found duplicate YAML directive", token.start_mark)
                major, minor = token.value
                if major != 1:
                    raise ParserError(None, None,
                            "found incompatible YAML document (version 1.* is required)",
                            token.start_mark)
                self.yaml_version = token.value
            elif token.name == u'TAG':
                handle, prefix = token.value
                if handle in self.tag_handles:
                    raise ParserError(None, None,
                            "duplicate tag handle %r" % handle.encode('utf-8'),
                            token.start_mark)
                self.tag_handles[handle] = prefix
        version_value = self.yaml_version
        tags_value = None
        if self.tag_handles:
            # Report only the explicitly declared handles to the caller.
            tags_value = self.tag_handles.copy()
        # Fill in the default handles without clobbering explicit ones.
        for key in self.DEFAULT_TAGS:
            if key not in self.tag_handles:
                self.tag_handles[key] = self.DEFAULT_TAGS[key]
        return version_value, tags_value

    def parse_block_node(self):
        return self.parse_node(block=True)

    def parse_flow_node(self):
        return self.parse_node()

    def parse_block_node_or_indentless_sequence(self):
        return self.parse_node(block=True, indentless_sequence=True)

    def parse_node(self, block=False, indentless_sequence=False):
        # block_node    ::= ALIAS | properties? block_content
        # flow_node     ::= ALIAS | properties? flow_content
        # properties    ::= TAG ANCHOR? | ANCHOR TAG?
        # block_content     ::= block_collection | flow_collection | SCALAR
        # flow_content      ::= flow_collection | SCALAR
        # block_collection  ::= block_sequence | block_mapping
        # block_node_or_indentless_sequence ::= ALIAS | properties?
        #                                       (block_content | indentless_block_sequence)
        if self.scanner.check(AliasToken):
            token = self.scanner.get()
            yield AliasEvent(token.value, token.start_mark, token.end_mark)
        else:
            anchor = None
            tag = None
            start_mark = end_mark = tag_mark = None
            # Parse node properties in either order: ANCHOR TAG? or TAG ANCHOR?.
            if self.scanner.check(AnchorToken):
                token = self.scanner.get()
                start_mark = token.start_mark
                end_mark = token.end_mark
                anchor = token.value
                if self.scanner.check(TagToken):
                    token = self.scanner.get()
                    tag_mark = token.start_mark
                    end_mark = token.end_mark
                    tag = token.value
            elif self.scanner.check(TagToken):
                token = self.scanner.get()
                start_mark = tag_mark = token.start_mark
                end_mark = token.end_mark
                tag = token.value
                if self.scanner.check(AnchorToken):
                    token = self.scanner.get()
                    end_mark = token.end_mark
                    anchor = token.value
            if tag is not None:
                # Resolve a (handle, suffix) pair against the declared
                # tag handles; a missing handle means a verbatim tag.
                handle, suffix = tag
                if handle is not None:
                    if handle not in self.tag_handles:
                        raise ParserError("while parsing a node", start_mark,
                                "found undefined tag handle %r" % handle.encode('utf-8'),
                                tag_mark)
                    tag = self.tag_handles[handle]+suffix
                else:
                    tag = suffix
            if tag is None:
                # Only plain scalars may stay untagged (to be resolved
                # implicitly later); everything else gets the '!' tag.
                if not (self.scanner.check(ScalarToken) and
                        self.scanner.peek().plain):
                    tag = u'!'
            if start_mark is None:
                start_mark = end_mark = self.scanner.peek().start_mark
            event = None
            collection_events = None
            if indentless_sequence and self.scanner.check(BlockEntryToken):
                end_mark = self.scanner.peek().end_mark
                event = SequenceEvent(anchor, tag, start_mark, end_mark,
                        flow=False, compact=False)
                collection_events = self.parse_indentless_sequence()
            else:
                if self.scanner.check(ScalarToken):
                    token = self.scanner.get()
                    end_mark = token.end_mark
                    event = ScalarEvent(anchor, tag, token.value,
                            start_mark, end_mark,
                            implicit=(tag is None), style=token.style)
                elif self.scanner.check(FlowSequenceStartToken):
                    end_mark = self.scanner.peek().end_mark
                    event = SequenceEvent(anchor, tag, start_mark, end_mark,
                            flow=True)
                    collection_events = self.parse_flow_sequence()
                elif self.scanner.check(FlowMappingStartToken):
                    end_mark = self.scanner.peek().end_mark
                    event = MappingEvent(anchor, tag, start_mark, end_mark,
                            flow=True)
                    collection_events = self.parse_flow_mapping()
                elif block and self.scanner.check(BlockSequenceStartToken):
                    end_mark = self.scanner.peek().start_mark
                    compact = self.scanner.peek().inline
                    event = SequenceEvent(anchor, tag, start_mark, end_mark,
                            flow=False, compact=compact)
                    collection_events = self.parse_block_sequence()
                elif block and self.scanner.check(BlockMappingStartToken):
                    end_mark = self.scanner.peek().start_mark
                    compact = self.scanner.peek().inline
                    event = MappingEvent(anchor, tag, start_mark, end_mark,
                            flow=False, compact=compact)
                    collection_events = self.parse_block_mapping()
                elif anchor is not None or tag is not None:
                    # Empty scalars are allowed even if a tag or an anchor is
                    # specified.
                    event = ScalarEvent(anchor, tag, u'', start_mark, end_mark,
                            implicit=False, style='')
                else:
                    if block:
                        node = 'block'
                    else:
                        node = 'flow'
                    token = self.scanner.peek()
                    raise ParserError("while scanning a %s node" % node, start_mark,
                            "expected the node content, but found %r" % token.id,
                            token.start_mark)
            yield event
            if collection_events is not None:
                for event in collection_events:
                    yield event

    def parse_block_sequence(self):
        # BLOCK-SEQUENCE-START (BLOCK-ENTRY block_node?)* BLOCK-END
        token = self.scanner.get()
        start_mark = token.start_mark
        while self.scanner.check(BlockEntryToken):
            token = self.scanner.get()
            if not self.scanner.check(BlockEntryToken, BlockEndToken):
                for event in self.parse_block_node():
                    yield event
            else:
                # '- ' with no content: an empty scalar entry.
                yield self.process_empty_scalar(token.end_mark)
        if not self.scanner.check(BlockEndToken):
            token = self.scanner.peek()
            raise ParserError("while scanning a block collection", start_mark,
                    "expected <block end>, but found %r" % token.id, token.start_mark)
        token = self.scanner.get()
        yield CollectionEndEvent(token.start_mark, token.end_mark)

    def parse_indentless_sequence(self):
        # (BLOCK-ENTRY block_node?)+
        # An indentless sequence has no BLOCK-END token; it stops at the
        # first KEY, VALUE or BLOCK-END belonging to the enclosing mapping.
        while self.scanner.check(BlockEntryToken):
            token = self.scanner.get()
            if not self.scanner.check(BlockEntryToken,
                    KeyToken, ValueToken, BlockEndToken):
                for event in self.parse_block_node():
                    yield event
            else:
                yield self.process_empty_scalar(token.end_mark)
        token = self.scanner.peek()
        yield CollectionEndEvent(token.start_mark, token.start_mark)

    def parse_block_mapping(self):
        # BLOCK-MAPPING_START
        #   ((KEY block_node_or_indentless_sequence?)?
        #   (VALUE block_node_or_indentless_sequence?)?)*
        # BLOCK-END
        token = self.scanner.get()
        start_mark = token.start_mark
        while self.scanner.check(KeyToken, ValueToken):
            if self.scanner.check(KeyToken):
                token = self.scanner.get()
                if not self.scanner.check(KeyToken, ValueToken, BlockEndToken):
                    for event in self.parse_block_node_or_indentless_sequence():
                        yield event
                else:
                    yield self.process_empty_scalar(token.end_mark)
            if self.scanner.check(ValueToken):
                token = self.scanner.get()
                if not self.scanner.check(KeyToken, ValueToken, BlockEndToken):
                    for event in self.parse_block_node_or_indentless_sequence():
                        yield event
                else:
                    yield self.process_empty_scalar(token.end_mark)
            else:
                # A key with no VALUE token: the value is an empty scalar.
                token = self.scanner.peek()
                yield self.process_empty_scalar(token.start_mark)
        if not self.scanner.check(BlockEndToken):
            token = self.scanner.peek()
            raise ParserError("while scanning a block mapping", start_mark,
                    "expected <block end>, but found %r" % token.id, token.start_mark)
        token = self.scanner.get()
        yield CollectionEndEvent(token.start_mark, token.end_mark)

    def parse_flow_sequence(self):
        # flow_sequence     ::= FLOW-SEQUENCE-START
        #                       (flow_sequence_entry FLOW-ENTRY)*
        #                       flow_sequence_entry?
        #                       FLOW-SEQUENCE-END
        # flow_sequence_entry   ::= flow_node | KEY flow_node? (VALUE flow_node?)?
        #
        # Note that while production rules for both flow_sequence_entry and
        # flow_mapping_entry are equal, their interpretations are different.
        # For `flow_sequence_entry`, the part `KEY flow_node? (VALUE flow_node?)?`
        # generate an inline mapping (set syntax).
        token = self.scanner.get()
        start_mark = token.start_mark
        while not self.scanner.check(FlowSequenceEndToken):
            if self.scanner.check(KeyToken):
                # 'KEY node? (VALUE node?)?' inside a flow sequence becomes
                # a compact single-pair mapping entry.
                token = self.scanner.get()
                yield MappingEvent(None, u'!',
                        token.start_mark, token.end_mark,
                        flow=True, compact=True)
                if not self.scanner.check(ValueToken,
                        FlowEntryToken, FlowSequenceEndToken):
                    for event in self.parse_flow_node():
                        yield event
                else:
                    yield self.process_empty_scalar(token.end_mark)
                if self.scanner.check(ValueToken):
                    token = self.scanner.get()
                    if not self.scanner.check(FlowEntryToken, FlowSequenceEndToken):
                        for event in self.parse_flow_node():
                            yield event
                    else:
                        yield self.process_empty_scalar(token.end_mark)
                else:
                    token = self.scanner.peek()
                    yield self.process_empty_scalar(token.start_mark)
                token = self.scanner.peek()
                yield CollectionEndEvent(token.start_mark, token.start_mark)
            else:
                for event in self.parse_flow_node():
                    yield event
            if not self.scanner.check(FlowEntryToken, FlowSequenceEndToken):
                token = self.scanner.peek()
                raise ParserError("while scanning a flow sequence", start_mark,
                        "expected ',' or ']', but got %r" % token.id, token.start_mark)
            if self.scanner.check(FlowEntryToken):
                self.scanner.get()
        token = self.scanner.get()
        yield CollectionEndEvent(token.start_mark, token.end_mark)

    def parse_flow_mapping(self):
        # flow_mapping      ::= FLOW-MAPPING-START
        #                       (flow_mapping_entry FLOW-ENTRY)*
        #                       flow_mapping_entry?
        #                       FLOW-MAPPING-END
        # flow_mapping_entry    ::= flow_node | KEY flow_node? (VALUE flow_node?)?
        token = self.scanner.get()
        start_mark = token.start_mark
        while not self.scanner.check(FlowMappingEndToken):
            if self.scanner.check(KeyToken):
                token = self.scanner.get()
                if not self.scanner.check(ValueToken,
                        FlowEntryToken, FlowMappingEndToken):
                    for event in self.parse_flow_node():
                        yield event
                else:
                    yield self.process_empty_scalar(token.end_mark)
                if self.scanner.check(ValueToken):
                    token = self.scanner.get()
                    if not self.scanner.check(FlowEntryToken, FlowMappingEndToken):
                        for event in self.parse_flow_node():
                            yield event
                    else:
                        yield self.process_empty_scalar(token.end_mark)
                else:
                    token = self.scanner.peek()
                    yield self.process_empty_scalar(token.start_mark)
            else:
                # A bare node is treated as a key with an empty value.
                for event in self.parse_flow_node():
                    yield event
                yield self.process_empty_scalar(self.scanner.peek().start_mark)
            if not self.scanner.check(FlowEntryToken, FlowMappingEndToken):
                token = self.scanner.peek()
                raise ParserError("while scanning a flow mapping", start_mark,
                        "expected ',' or '}', but got %r" % token.id, token.start_mark)
            if self.scanner.check(FlowEntryToken):
                self.scanner.get()
        # The loop above can only exit once FLOW-MAPPING-END is the next
        # token, so no extra check is needed here.  (A former redundant
        # "expected '}'" branch at this point was unreachable.)
        token = self.scanner.get()
        yield CollectionEndEvent(token.start_mark, token.end_mark)

    def process_empty_scalar(self, mark):
        # Build a zero-length plain scalar event anchored at `mark`;
        # used wherever the grammar permits an omitted node.
        return ScalarEvent(None, None, u'', mark, mark, implicit=True)
Note: See TracBrowser for help on using the repository browser.