source: pyyaml/trunk/lib/yaml/parser.py @ 118

Revision 118, 21.0 KB checked in by xi, 8 years ago (diff)
  • Add the token STREAM-START.
  • Add parsing events: STREAM-START, DOCUMENT-START, DOCUMENT-END.
Line 
1
2# YAML can be parsed by an LL(1) parser!
3#
4# We use the following production rules:
5# stream            ::= STREAM-START implicit_document? explicit_document* STREAM-END
6# explicit_document ::= DIRECTIVE* DOCUMENT-START block_node? DOCUMENT-END?
7# implicit_document ::= block_node DOCUMENT-END?
8# block_node    ::= ALIAS | properties? block_content
9# flow_node     ::= ALIAS | properties? flow_content
10# properties    ::= TAG ANCHOR? | ANCHOR TAG?
11# block_content     ::= block_collection | flow_collection | SCALAR
12# flow_content      ::= flow_collection | SCALAR
13# block_collection  ::= block_sequence | block_mapping
14# block_sequence    ::= BLOCK-SEQUENCE-START (BLOCK-ENTRY block_node?)* BLOCK-END
15# block_mapping     ::= BLOCK-MAPPING_START ((KEY block_node_or_indentless_sequence?)? (VALUE block_node_or_indentless_sequence?)?)* BLOCK-END
16# block_node_or_indentless_sequence ::= ALIAS | properties? (block_content | indentless_block_sequence)
17# indentless_block_sequence         ::= (BLOCK-ENTRY block_node?)+
18# flow_collection   ::= flow_sequence | flow_mapping
19# flow_sequence     ::= FLOW-SEQUENCE-START (flow_sequence_entry FLOW-ENTRY)* flow_sequence_entry? FLOW-SEQUENCE-END
20# flow_mapping      ::= FLOW-MAPPING-START (flow_mapping_entry FLOW-ENTRY)* flow_mapping_entry? FLOW-MAPPING-END
21# flow_sequence_entry   ::= flow_node | KEY flow_node? (VALUE flow_node?)?
22# flow_mapping_entry    ::= flow_node | KEY flow_node? (VALUE flow_node?)?
23
24# TODO: support for BOM within a stream.
25# stream ::= (BOM? implicit_document)? (BOM? explicit_document)* STREAM-END
26
27# Note that there is a slight deviation from the specification. We require a
28# non-empty node content if ANCHOR or TAG is specified. This disallows such
29# documents as
30#
31#   key:    !!str   # empty value
32#
33# This is done to prevent ambiguity in parsing tags and aliases:
34#
35#   {   !!perl/YAML::Parser:    value }
36#
37# What is it? Should it be interpreted as
38#   {   ? !<tag:yaml.org,2002:perl/YAML::Parser> '' : value }
39# or
40#   {   ? !<tag:yaml.org,2002:perl/YAML::Parser:> value : '' }
41# Since we disallow empty node content, tags are always followed by spaces
42# or line breaks.
43
44# FIRST sets:
45# stream: { STREAM-START }
46# explicit_document: { DIRECTIVE DOCUMENT-START }
47# implicit_document: FIRST(block_node)
48# block_node: { ALIAS TAG ANCHOR SCALAR BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START }
49# flow_node: { ALIAS ANCHOR TAG SCALAR FLOW-SEQUENCE-START FLOW-MAPPING-START }
50# block_content: { BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START SCALAR }
51# flow_content: { FLOW-SEQUENCE-START FLOW-MAPPING-START SCALAR }
52# block_collection: { BLOCK-SEQUENCE-START BLOCK-MAPPING-START }
53# flow_collection: { FLOW-SEQUENCE-START FLOW-MAPPING-START }
54# block_sequence: { BLOCK-SEQUENCE-START }
55# block_mapping: { BLOCK-MAPPING-START }
56# block_node_or_indentless_sequence: { ALIAS ANCHOR TAG SCALAR BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START BLOCK-ENTRY }
57# indentless_sequence: { ENTRY }
58# flow_collection: { FLOW-SEQUENCE-START FLOW-MAPPING-START }
59# flow_sequence: { FLOW-SEQUENCE-START }
60# flow_mapping: { FLOW-MAPPING-START }
61# flow_sequence_entry: { ALIAS ANCHOR TAG SCALAR FLOW-SEQUENCE-START FLOW-MAPPING-START KEY }
62# flow_mapping_entry: { ALIAS ANCHOR TAG SCALAR FLOW-SEQUENCE-START FLOW-MAPPING-START KEY }
63
64__all__ = ['Parser', 'ParserError']
65
66from error import MarkedYAMLError
67from tokens import *
68from events import *
69
# Raised when the token stream violates the YAML grammar (see the raise
# sites in Parser below); all mark/formatting behavior is inherited
# unchanged from MarkedYAMLError.
class ParserError(MarkedYAMLError):
    pass
class Parser:
    """Turn the scanner's token stream into a stream of parsing events.

    Since writing an LL(1) parser is a straightforward task, we do not give
    many comments here.
    Note that we use Python generators. If you rewrite the parser in another
    language, you may replace all 'yield'-s with event handler calls.
    """

    # Tag handle resolutions used when a document declares no %TAG
    # directives of its own.  NOTE: process_directives() and parse_stream()
    # only ever *reassign* self.tag_handles, never mutate it in place, so
    # sharing this dict by reference is safe.
    DEFAULT_TAGS = {
        u'!':   u'!',
        u'!!':  u'tag:yaml.org,2002:',
    }

    def __init__(self, scanner):
        # `scanner` must provide check()/peek()/get() over tokens
        # (duck-typed; presumably yaml.scanner.Scanner — confirm at call site).
        self.scanner = scanner
        self.current_event = None           # one-event lookahead buffer
        self.yaml_version = None            # (major, minor) from %YAML, if any
        self.tag_handles = {}               # handle -> prefix from %TAG
        self.event_generator = self.parse_stream()

    def _ensure_event(self):
        # Fill the one-event lookahead buffer from the generator, unless an
        # event is already buffered.  When the generator is exhausted,
        # `current_event` simply stays None.  This was previously copy-pasted
        # in check(), peek() and get().
        if self.current_event is None:
            try:
                self.current_event = self.event_generator.next()
            except StopIteration:
                pass

    def check(self, *choices):
        # Check whether the type of the next event matches one of `choices`.
        # Returns False at the end of the event stream.
        self._ensure_event()
        if self.current_event is not None:
            for choice in choices:
                if isinstance(self.current_event, choice):
                    return True
        return False

    def peek(self):
        # Return the next event without consuming it (None at end of stream).
        self._ensure_event()
        return self.current_event

    def get(self):
        # Return and consume the next event (None at end of stream).
        self._ensure_event()
        value = self.current_event
        self.current_event = None
        return value

    def __iter__(self):
        # Iterator protocol.  NOTE(review): this returns the raw generator,
        # bypassing the check/peek/get buffer — mixing iteration with those
        # accessors would skip a buffered event.  Preserved as-is.
        return self.event_generator

    def parse_stream(self):
        # stream ::= STREAM-START implicit_document? explicit_document* STREAM-END

        # Parse start of stream.
        token = self.scanner.get()
        yield StreamStartEvent(token.start_mark, token.end_mark)

        # Parse implicit document: a bare node not introduced by directives
        # or '---'.  Its DocumentStart/End events get zero-width marks.
        if not self.scanner.check(DirectiveToken, DocumentStartToken,
                StreamEndToken):
            self.tag_handles = self.DEFAULT_TAGS
            token = self.scanner.peek()
            start_mark = end_mark = token.start_mark
            yield DocumentStartEvent(start_mark, end_mark)
            for event in self.parse_block_node():
                yield event
            token = self.scanner.peek()
            start_mark = end_mark = token.start_mark
            # Swallow any number of '...' terminators.
            while self.scanner.check(DocumentEndToken):
                token = self.scanner.get()
                end_mark = token.end_mark
            yield DocumentEndEvent(start_mark, end_mark)

        # Parse explicit documents: DIRECTIVE* '---' block_node? '...'*
        while not self.scanner.check(StreamEndToken):
            token = self.scanner.peek()
            start_mark = token.start_mark
            self.process_directives()
            if not self.scanner.check(DocumentStartToken):
                raise ParserError(None, None,
                        "expected '<document start>', but found %r"
                        % self.scanner.peek().id,
                        self.scanner.peek().start_mark)
            token = self.scanner.get()
            end_mark = token.end_mark
            yield DocumentStartEvent(start_mark, end_mark)
            # '--- ---', '--- ...' etc. denote an empty document body.
            if self.scanner.check(DirectiveToken,
                    DocumentStartToken, DocumentEndToken, StreamEndToken):
                yield self.process_empty_scalar(token.end_mark)
            else:
                for event in self.parse_block_node():
                    yield event
            token = self.scanner.peek()
            start_mark = end_mark = token.start_mark
            while self.scanner.check(DocumentEndToken):
                token = self.scanner.get()
                end_mark = token.end_mark
            yield DocumentEndEvent(start_mark, end_mark)

        # Parse end of stream.
        token = self.scanner.get()
        yield StreamEndEvent(token.start_mark, token.end_mark)

    def process_directives(self):
        # DIRECTIVE*
        # Collect the %YAML and %TAG directives preceding a document, then
        # fill in the default handles the document did not redefine.
        self.yaml_version = None
        self.tag_handles = {}
        while self.scanner.check(DirectiveToken):
            token = self.scanner.get()
            if token.name == u'YAML':
                if self.yaml_version is not None:
                    raise ParserError(None, None,
                            "found duplicate YAML directive", token.start_mark)
                major, minor = token.value
                if major != 1:
                    raise ParserError(None, None,
                            "found incompatible YAML document (version 1.* is required)",
                            token.start_mark)
                self.yaml_version = token.value
            elif token.name == u'TAG':
                handle, prefix = token.value
                if handle in self.tag_handles:
                    raise ParserError(None, None,
                            "duplicate tag handle %r" % handle.encode('utf-8'),
                            token.start_mark)
                self.tag_handles[handle] = prefix
        for key in self.DEFAULT_TAGS:
            if key not in self.tag_handles:
                self.tag_handles[key] = self.DEFAULT_TAGS[key]

    def parse_block_node(self):
        # Entry point for a node in block context.
        return self.parse_node(block=True)

    def parse_flow_node(self):
        # Entry point for a node in flow context.
        return self.parse_node()

    def parse_block_node_or_indentless_sequence(self):
        # Entry point for a block-mapping value, which may be an
        # indentless '- item' sequence.
        return self.parse_node(block=True, indentless_sequence=True)

    def parse_node(self, block=False, indentless_sequence=False):
        # block_node    ::= ALIAS | properties? block_content
        # flow_node     ::= ALIAS | properties? flow_content
        # properties    ::= TAG ANCHOR? | ANCHOR TAG?
        # block_content     ::= block_collection | flow_collection | SCALAR
        # flow_content      ::= flow_collection | SCALAR
        # block_collection  ::= block_sequence | block_mapping
        # block_node_or_indentless_sequence ::= ALIAS | properties?
        #                                       (block_content | indentless_block_sequence)
        if self.scanner.check(AliasToken):
            token = self.scanner.get()
            yield AliasEvent(token.value, token.start_mark, token.end_mark)
        else:
            anchor = None
            tag = None
            start_mark = end_mark = tag_mark = None
            # Properties may come in either order: ANCHOR TAG? or TAG ANCHOR?.
            if self.scanner.check(AnchorToken):
                token = self.scanner.get()
                start_mark = end_mark = token.start_mark
                anchor = token.value
                if self.scanner.check(TagToken):
                    token = self.scanner.get()
                    end_mark = tag_mark = token.start_mark
                    tag = token.value
            elif self.scanner.check(TagToken):
                token = self.scanner.get()
                start_mark = end_mark = tag_mark = token.start_mark
                tag = token.value
                if self.scanner.check(AnchorToken):
                    token = self.scanner.get()
                    end_mark = token.start_mark
                    anchor = token.value
            # Resolve a (handle, suffix) tag through the %TAG table.
            if tag is not None:
                handle, suffix = tag
                if handle is not None:
                    if handle not in self.tag_handles:
                        raise ParserError("while parsing a node", start_mark,
                                "found undefined tag handle %r" % handle.encode('utf-8'),
                                tag_mark)
                    tag = self.tag_handles[handle]+suffix
                else:
                    tag = suffix
            # An untagged node gets the non-specific tag u'!' unless it is a
            # plain scalar (which is left to the resolver).
            if tag is None:
                if not (self.scanner.check(ScalarToken) and
                        self.scanner.peek().plain):
                    tag = u'!'
            if start_mark is None:
                start_mark = self.scanner.peek().start_mark
            event = None
            collection_events = None
            # Dispatch on the first content token; collections yield their
            # start event here and the rest via a nested generator.
            if indentless_sequence and self.scanner.check(BlockEntryToken):
                end_mark = self.scanner.peek().end_mark
                event = SequenceEvent(anchor, tag, start_mark, end_mark)
                collection_events = self.parse_indentless_sequence()
            else:
                if self.scanner.check(ScalarToken):
                    token = self.scanner.get()
                    end_mark = token.end_mark
                    event = ScalarEvent(anchor, tag, token.value,
                            start_mark, end_mark)
                elif self.scanner.check(FlowSequenceStartToken):
                    end_mark = self.scanner.peek().end_mark
                    event = SequenceEvent(anchor, tag, start_mark, end_mark)
                    collection_events = self.parse_flow_sequence()
                elif self.scanner.check(FlowMappingStartToken):
                    end_mark = self.scanner.peek().end_mark
                    event = MappingEvent(anchor, tag, start_mark, end_mark)
                    collection_events = self.parse_flow_mapping()
                elif block and self.scanner.check(BlockSequenceStartToken):
                    end_mark = self.scanner.peek().start_mark
                    event = SequenceEvent(anchor, tag, start_mark, end_mark)
                    collection_events = self.parse_block_sequence()
                elif block and self.scanner.check(BlockMappingStartToken):
                    end_mark = self.scanner.peek().start_mark
                    event = MappingEvent(anchor, tag, start_mark, end_mark)
                    collection_events = self.parse_block_mapping()
                else:
                    if block:
                        node = 'block'
                    else:
                        node = 'flow'
                    token = self.scanner.peek()
                    raise ParserError("while scanning a %s node" % node, start_mark,
                            "expected the node content, but found %r" % token.id,
                            token.start_mark)
            yield event
            if collection_events is not None:
                for event in collection_events:
                    yield event

    def parse_block_sequence(self):
        # BLOCK-SEQUENCE-START (BLOCK-ENTRY block_node?)* BLOCK-END
        token = self.scanner.get()
        start_mark = token.start_mark
        while self.scanner.check(BlockEntryToken):
            token = self.scanner.get()
            if not self.scanner.check(BlockEntryToken, BlockEndToken):
                for event in self.parse_block_node():
                    yield event
            else:
                # A '-' with no node after it denotes an empty entry.
                yield self.process_empty_scalar(token.end_mark)
        if not self.scanner.check(BlockEndToken):
            token = self.scanner.peek()
            raise ParserError("while scanning a block collection", start_mark,
                    "expected <block end>, but found %r" % token.id, token.start_mark)
        token = self.scanner.get()
        yield CollectionEndEvent(token.start_mark, token.end_mark)

    def parse_indentless_sequence(self):
        # (BLOCK-ENTRY block_node?)+
        # An indentless sequence has no BLOCK-END token; it ends at the
        # first KEY/VALUE/BLOCK-END of the enclosing mapping, so the
        # closing event gets a zero-width mark.
        while self.scanner.check(BlockEntryToken):
            token = self.scanner.get()
            if not self.scanner.check(BlockEntryToken,
                    KeyToken, ValueToken, BlockEndToken):
                for event in self.parse_block_node():
                    yield event
            else:
                yield self.process_empty_scalar(token.end_mark)
        token = self.scanner.peek()
        yield CollectionEndEvent(token.start_mark, token.start_mark)

    def parse_block_mapping(self):
        # BLOCK-MAPPING_START
        #   ((KEY block_node_or_indentless_sequence?)?
        #   (VALUE block_node_or_indentless_sequence?)?)*
        # BLOCK-END
        token = self.scanner.get()
        start_mark = token.start_mark
        while self.scanner.check(KeyToken, ValueToken):
            if self.scanner.check(KeyToken):
                token = self.scanner.get()
                if not self.scanner.check(KeyToken, ValueToken, BlockEndToken):
                    for event in self.parse_block_node_or_indentless_sequence():
                        yield event
                else:
                    # '?' with no key node: empty key scalar.
                    yield self.process_empty_scalar(token.end_mark)
            if self.scanner.check(ValueToken):
                token = self.scanner.get()
                if not self.scanner.check(KeyToken, ValueToken, BlockEndToken):
                    for event in self.parse_block_node_or_indentless_sequence():
                        yield event
                else:
                    # ':' with no value node: empty value scalar.
                    yield self.process_empty_scalar(token.end_mark)
            else:
                # Key without any ':' at all: empty value scalar.
                token = self.scanner.peek()
                yield self.process_empty_scalar(token.start_mark)
        if not self.scanner.check(BlockEndToken):
            token = self.scanner.peek()
            raise ParserError("while scanning a block mapping", start_mark,
                    "expected <block end>, but found %r" % token.id, token.start_mark)
        token = self.scanner.get()
        yield CollectionEndEvent(token.start_mark, token.end_mark)

    def parse_flow_sequence(self):
        # flow_sequence     ::= FLOW-SEQUENCE-START
        #                       (flow_sequence_entry FLOW-ENTRY)*
        #                       flow_sequence_entry?
        #                       FLOW-SEQUENCE-END
        # flow_sequence_entry   ::= flow_node | KEY flow_node? (VALUE flow_node?)?
        #
        # Note that while production rules for both flow_sequence_entry and
        # flow_mapping_entry are equal, their interpretations are different.
        # For `flow_sequence_entry`, the part `KEY flow_node? (VALUE flow_node?)?`
        # generate an inline mapping (set syntax).
        token = self.scanner.get()
        start_mark = token.start_mark
        while not self.scanner.check(FlowSequenceEndToken):
            if self.scanner.check(KeyToken):
                # '? key: value' inside a flow sequence produces a
                # single-pair mapping entry.
                token = self.scanner.get()
                yield MappingEvent(None, u'!',
                        token.start_mark, token.end_mark)
                if not self.scanner.check(ValueToken,
                        FlowEntryToken, FlowSequenceEndToken):
                    for event in self.parse_flow_node():
                        yield event
                else:
                    yield self.process_empty_scalar(token.end_mark)
                if self.scanner.check(ValueToken):
                    token = self.scanner.get()
                    if not self.scanner.check(FlowEntryToken, FlowSequenceEndToken):
                        for event in self.parse_flow_node():
                            yield event
                    else:
                        yield self.process_empty_scalar(token.end_mark)
                else:
                    token = self.scanner.peek()
                    yield self.process_empty_scalar(token.start_mark)
                # Close the inline single-pair mapping (zero-width mark).
                token = self.scanner.peek()
                yield CollectionEndEvent(token.start_mark, token.start_mark)
            else:
                for event in self.parse_flow_node():
                    yield event
            if not self.scanner.check(FlowEntryToken, FlowSequenceEndToken):
                token = self.scanner.peek()
                raise ParserError("while scanning a flow sequence", start_mark,
                        "expected ',' or ']', but got %r" % token.id, token.start_mark)
            if self.scanner.check(FlowEntryToken):
                self.scanner.get()
        token = self.scanner.get()
        yield CollectionEndEvent(token.start_mark, token.end_mark)

    def parse_flow_mapping(self):
        # flow_mapping      ::= FLOW-MAPPING-START
        #                       (flow_mapping_entry FLOW-ENTRY)*
        #                       flow_mapping_entry?
        #                       FLOW-MAPPING-END
        # flow_mapping_entry    ::= flow_node | KEY flow_node? (VALUE flow_node?)?
        token = self.scanner.get()
        start_mark = token.start_mark
        while not self.scanner.check(FlowMappingEndToken):
            if self.scanner.check(KeyToken):
                token = self.scanner.get()
                if not self.scanner.check(ValueToken,
                        FlowEntryToken, FlowMappingEndToken):
                    for event in self.parse_flow_node():
                        yield event
                else:
                    yield self.process_empty_scalar(token.end_mark)
                if self.scanner.check(ValueToken):
                    token = self.scanner.get()
                    if not self.scanner.check(FlowEntryToken, FlowMappingEndToken):
                        for event in self.parse_flow_node():
                            yield event
                    else:
                        yield self.process_empty_scalar(token.end_mark)
                else:
                    token = self.scanner.peek()
                    yield self.process_empty_scalar(token.start_mark)
            else:
                # A bare node used as an entry is a key with an empty value
                # (e.g. '{a}' parses as 'a: <empty>').
                for event in self.parse_flow_node():
                    yield event
                yield self.process_empty_scalar(self.scanner.peek().start_mark)
            if not self.scanner.check(FlowEntryToken, FlowMappingEndToken):
                token = self.scanner.peek()
                raise ParserError("while scanning a flow mapping", start_mark,
                        "expected ',' or '}', but got %r" % token.id, token.start_mark)
            if self.scanner.check(FlowEntryToken):
                self.scanner.get()
        # The loop above can only exit when the next token is
        # FLOW-MAPPING-END, so the previously duplicated "expected '}'"
        # check here was unreachable and has been removed.
        token = self.scanner.get()
        yield CollectionEndEvent(token.start_mark, token.end_mark)

    def process_empty_scalar(self, mark):
        # Build a zero-width empty scalar event at `mark` — used wherever
        # the grammar allows an omitted node.
        return ScalarEvent(None, None, u'', mark, mark)
464
Note: See TracBrowser for help on using the repository browser.