source: branches/pyyaml3000/lib/yaml/parser.py @ 51

Revision 51, 20.7 KB checked in by xi, 9 years ago (diff)

Parser is done. Add iterator interfaces for Scanner and Parser.

RevLine 
[43]1
[51]2# YAML can be parsed by an LL(1) parser!
3#
4# We use the following production rules:
5# stream            ::= implicit_document? explicit_document* STREAM-END
[43]6# explicit_document ::= DIRECTIVE* DOCUMENT-START block_node? DOCUMENT-END?
7# implicit_document ::= block_node DOCUMENT-END?
8# block_node    ::= ALIAS | properties? block_content
9# flow_node     ::= ALIAS | properties? flow_content
10# properties    ::= TAG ANCHOR? | ANCHOR TAG?
11# block_content     ::= block_collection | flow_collection | SCALAR
12# flow_content      ::= flow_collection | SCALAR
13# block_collection  ::= block_sequence | block_mapping
[51]14# block_sequence    ::= BLOCK-SEQUENCE-START (BLOCK-ENTRY block_node?)* BLOCK-END
[43]15# block_mapping     ::= BLOCK-MAPPING_START ((KEY block_node_or_indentless_sequence?)? (VALUE block_node_or_indentless_sequence?)?)* BLOCK-END
16# block_node_or_indentless_sequence ::= ALIAS | properties? (block_content | indentless_block_sequence)
[51]17# indentless_block_sequence         ::= (BLOCK-ENTRY block_node?)+
[43]18# flow_collection   ::= flow_sequence | flow_mapping
[51]19# flow_sequence     ::= FLOW-SEQUENCE-START (flow_sequence_entry FLOW-ENTRY)* flow_sequence_entry? FLOW-SEQUENCE-END
20# flow_mapping      ::= FLOW-MAPPING-START (flow_mapping_entry FLOW-ENTRY)* flow_mapping_entry? FLOW-MAPPING-END
21# flow_sequence_entry   ::= flow_node | KEY flow_node? (VALUE flow_node?)?
22# flow_mapping_entry    ::= flow_node | KEY flow_node? (VALUE flow_node?)?
23#
24# Note that there is a slight deviation from the specification. We require a
# non-empty node content if ANCHOR or TAG is specified. This disallows such
26# documents as
27#
28#   key:    !!str   # empty value
29#
30# This is done to prevent ambiguity in parsing tags and aliases:
31#
32#   {   !!perl/YAML::Parser:    value }
33#
34# What is it? Should it be interpreted as
35#   {   ? !<tag:yaml.org,2002:perl/YAML::Parser> '' : value }
36# or
37#   {   ? !<tag:yaml.org,2002:perl/YAML::Parser:> value : '' }
# Since we disallow empty node content, tags are always followed by spaces
39# or line breaks.
[43]40
[51]41# FIRST sets:
42# stream: FIRST(block_node) + { DIRECTIVE DOCUMENT-START }
[43]43# explicit_document: { DIRECTIVE DOCUMENT-START }
[51]44# implicit_document: FIRST(block_node)
[43]45# block_node: { ALIAS TAG ANCHOR SCALAR BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START }
46# flow_node: { ALIAS ANCHOR TAG SCALAR FLOW-SEQUENCE-START FLOW-MAPPING-START }
47# block_content: { BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START SCALAR }
48# flow_content: { FLOW-SEQUENCE-START FLOW-MAPPING-START SCALAR }
49# block_collection: { BLOCK-SEQUENCE-START BLOCK-MAPPING-START }
50# flow_collection: { FLOW-SEQUENCE-START FLOW-MAPPING-START }
51# block_sequence: { BLOCK-SEQUENCE-START }
52# block_mapping: { BLOCK-MAPPING-START }
[51]53# block_node_or_indentless_sequence: { ALIAS ANCHOR TAG SCALAR BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START BLOCK-ENTRY }
# indentless_sequence: { BLOCK-ENTRY }
55# flow_collection: { FLOW-SEQUENCE-START FLOW-MAPPING-START }
56# flow_sequence: { FLOW-SEQUENCE-START }
57# flow_mapping: { FLOW-MAPPING-START }
58# flow_sequence_entry: { ALIAS ANCHOR TAG SCALAR FLOW-SEQUENCE-START FLOW-MAPPING-START KEY }
59# flow_mapping_entry: { ALIAS ANCHOR TAG SCALAR FLOW-SEQUENCE-START FLOW-MAPPING-START KEY }
60
[46]61from error import YAMLError
62from tokens import *
[51]63from events import *
[44]64
class ParserError(YAMLError):
    # Error raised by the parser.  Carries an optional context description
    # and an optional problem description, each with an optional marker
    # pointing into the input stream.

    def __init__(self, context=None, context_marker=None,
            problem=None, problem_marker=None):
        self.context = context
        self.context_marker = context_marker
        self.problem = problem
        self.problem_marker = problem_marker

    def __str__(self):
        # Render each present description on its own line, followed by its
        # marker (if any) on the next line.
        parts = []
        for description, marker in ((self.context, self.context_marker),
                                    (self.problem, self.problem_marker)):
            if description is None:
                continue
            parts.append(description)
            if marker is not None:
                parts.append(str(marker))
        return '\n'.join(parts)
[44]83
class Parser:
    # LL(1) parser: turns the scanner's token stream into an event stream.
    # Since writing an LL(1) parser is a straightforward task, we do not give
    # many comments here.
    # Note that we use Python generators. If you rewrite the parser to another
    # language, you may replace all 'yield'-s with event handler calls.

    # Tag handles that are always defined, even without %TAG directives.
    DEFAULT_TAGS = {
        u'!':   u'!',
        u'!!':  u'tag:yaml.org,2002:',
    }

    def __init__(self, scanner):
        # scanner must provide check(), peek() and get() over tokens.
        self.scanner = scanner
        self.current_event = None
        self.yaml_version = None
        self.tag_handles = {}
        self.event_generator = self.parse_stream()

    def check(self, *choices):
        # Check the type of the next event without consuming it.
        if self.current_event is None:
            try:
                self.current_event = self.event_generator.next()
            except StopIteration:
                pass
        if self.current_event is not None:
            for choice in choices:
                if isinstance(self.current_event, choice):
                    return True
        return False

    def get(self):
        # Get the next event; returns None once the event stream is
        # exhausted.
        if self.current_event is None:
            try:
                self.current_event = self.event_generator.next()
            except StopIteration:
                pass
        value = self.current_event
        self.current_event = None
        return value

    def __iter__(self):
        # Iterator protocol.
        # NOTE(review): this returns the raw generator, bypassing any event
        # buffered by check() — mixing check()/get() with iteration may skip
        # one event.
        return self.event_generator

    def parse_stream(self):
        # implicit_document? explicit_document* STREAM-END

        # Parse implicit document.
        if not self.scanner.check(DirectiveToken, DocumentStartToken,
                StreamEndToken):
            self.tag_handles = self.DEFAULT_TAGS
            for event in self.parse_block_node():
                yield event

        # Parse explicit documents.
        while not self.scanner.check(StreamEndToken):
            self.process_directives()
            if not self.scanner.check(DocumentStartToken):
                raise ParserError(None, None,
                        "expected '<document start>', but found %r"
                        % self.scanner.peek().id,
                        self.scanner.peek().start_marker)
            token = self.scanner.get()
            if self.scanner.check(DirectiveToken,
                    DocumentStartToken, DocumentEndToken, StreamEndToken):
                # An explicit document with no content.
                yield self.process_empty_scalar(token.end_marker)
            else:
                for event in self.parse_block_node():
                    yield event
            while self.scanner.check(DocumentEndToken):
                self.scanner.get()

        # Parse end of stream.
        token = self.scanner.get()
        yield StreamEndEvent(token.start_marker, token.end_marker)

    def process_directives(self):
        # DIRECTIVE*
        # Collect %YAML and %TAG directives; resets state for each document.
        self.yaml_version = None
        self.tag_handles = {}
        while self.scanner.check(DirectiveToken):
            token = self.scanner.get()
            if token.name == u'YAML':
                if self.yaml_version is not None:
                    # BUG FIX: start_marker is an attribute, not a method;
                    # calling it raised TypeError instead of ParserError.
                    raise ParserError(None, None,
                            "found duplicate YAML directive", token.start_marker)
                major, minor = token.value
                if major != 1:
                    raise ParserError(None, None,
                            "found incompatible YAML document (version 1.* is required)",
                            token.start_marker)
                self.yaml_version = token.value
            elif token.name == u'TAG':
                handle, prefix = token.value
                if handle in self.tag_handles:
                    raise ParserError(None, None,
                            "duplicate tag handle %r" % handle.encode('utf-8'),
                            token.start_marker)
                self.tag_handles[handle] = prefix
        # The default handles apply unless explicitly redefined.
        for key in self.DEFAULT_TAGS:
            if key not in self.tag_handles:
                self.tag_handles[key] = self.DEFAULT_TAGS[key]

    def parse_block_node(self):
        return self.parse_node(block=True)

    def parse_flow_node(self):
        return self.parse_node()

    def parse_block_node_or_indentless_sequence(self):
        return self.parse_node(block=True, indentless_sequence=True)

    def parse_node(self, block=False, indentless_sequence=False):
        # block_node    ::= ALIAS | properties? block_content
        # flow_node     ::= ALIAS | properties? flow_content
        # properties    ::= TAG ANCHOR? | ANCHOR TAG?
        # block_content     ::= block_collection | flow_collection | SCALAR
        # flow_content      ::= flow_collection | SCALAR
        # block_collection  ::= block_sequence | block_mapping
        # block_node_or_indentless_sequence ::= ALIAS | properties?
        #                                       (block_content | indentless_block_sequence)
        if self.scanner.check(AliasToken):
            token = self.scanner.get()
            yield AliasEvent(token.value, token.start_marker, token.end_marker)
        else:
            # Parse the optional properties (anchor/tag in either order).
            anchor = None
            tag = None
            start_marker = end_marker = tag_marker = None
            if self.scanner.check(AnchorToken):
                token = self.scanner.get()
                start_marker = end_marker = token.start_marker
                anchor = token.value
                if self.scanner.check(TagToken):
                    token = self.scanner.get()
                    end_marker = tag_marker = token.start_marker
                    tag = token.value
            elif self.scanner.check(TagToken):
                token = self.scanner.get()
                start_marker = end_marker = tag_marker = token.start_marker
                tag = token.value
                if self.scanner.check(AnchorToken):
                    token = self.scanner.get()
                    end_marker = token.start_marker
                    anchor = token.value
            # Resolve the tag handle against the active %TAG directives.
            if tag is not None:
                handle, suffix = tag
                if handle is not None:
                    if handle not in self.tag_handles:
                        raise ParserError("while parsing a node", start_marker,
                                "found undefined tag handle %r" % handle.encode('utf-8'),
                                tag_marker)
                    tag = self.tag_handles[handle]+suffix
                else:
                    tag = suffix
            if tag is None:
                # Only plain scalars may carry a non-specific '?' tag (None);
                # everything else untagged gets the non-specific '!' tag.
                if not (self.scanner.check(ScalarToken) and
                        self.scanner.peek().plain):
                    tag = u'!'
            if start_marker is None:
                start_marker = self.scanner.peek().start_marker
            event = None
            collection_events = None
            if indentless_sequence and self.scanner.check(BlockEntryToken):
                end_marker = self.scanner.peek().end_marker
                event = SequenceEvent(anchor, tag, start_marker, end_marker)
                collection_events = self.parse_indentless_sequence()
            else:
                if self.scanner.check(ScalarToken):
                    token = self.scanner.get()
                    end_marker = token.end_marker
                    event = ScalarEvent(anchor, tag, token.value,
                            start_marker, end_marker)
                elif self.scanner.check(FlowSequenceStartToken):
                    end_marker = self.scanner.peek().end_marker
                    event = SequenceEvent(anchor, tag, start_marker, end_marker)
                    collection_events = self.parse_flow_sequence()
                elif self.scanner.check(FlowMappingStartToken):
                    end_marker = self.scanner.peek().end_marker
                    event = MappingEvent(anchor, tag, start_marker, end_marker)
                    collection_events = self.parse_flow_mapping()
                elif block and self.scanner.check(BlockSequenceStartToken):
                    end_marker = self.scanner.peek().start_marker
                    event = SequenceEvent(anchor, tag, start_marker, end_marker)
                    collection_events = self.parse_block_sequence()
                elif block and self.scanner.check(BlockMappingStartToken):
                    end_marker = self.scanner.peek().start_marker
                    event = MappingEvent(anchor, tag, start_marker, end_marker)
                    collection_events = self.parse_block_mapping()
                else:
                    if block:
                        node = 'block'
                    else:
                        node = 'flow'
                    token = self.scanner.peek()
                    raise ParserError("while scanning a %s node" % node, start_marker,
                            "expected the node content, but found %r" % token.id,
                            token.start_marker)
            # Emit the node event first, then the nested collection events.
            yield event
            if collection_events is not None:
                for event in collection_events:
                    yield event

    def parse_block_sequence(self):
        # BLOCK-SEQUENCE-START (BLOCK-ENTRY block_node?)* BLOCK-END
        token = self.scanner.get()
        start_marker = token.start_marker
        while self.scanner.check(BlockEntryToken):
            token = self.scanner.get()
            if not self.scanner.check(BlockEntryToken, BlockEndToken):
                for event in self.parse_block_node():
                    yield event
            else:
                # '-' with no content: an empty scalar entry.
                yield self.process_empty_scalar(token.end_marker)
        if not self.scanner.check(BlockEndToken):
            token = self.scanner.peek()
            raise ParserError("while scanning a block collection", start_marker,
                    "expected <block end>, but found %r" % token.id, token.start_marker)
        token = self.scanner.get()
        yield CollectionEndEvent(token.start_marker, token.end_marker)

    def parse_indentless_sequence(self):
        # (BLOCK-ENTRY block_node?)+
        while self.scanner.check(BlockEntryToken):
            token = self.scanner.get()
            if not self.scanner.check(BlockEntryToken,
                    KeyToken, ValueToken, BlockEndToken):
                for event in self.parse_block_node():
                    yield event
            else:
                yield self.process_empty_scalar(token.end_marker)
        # No BLOCK-END token here: the sequence ends where the next token
        # begins, so the end event carries a zero-width span.
        token = self.scanner.peek()
        yield CollectionEndEvent(token.start_marker, token.start_marker)

    def parse_block_mapping(self):
        # BLOCK-MAPPING_START
        #   ((KEY block_node_or_indentless_sequence?)?
        #   (VALUE block_node_or_indentless_sequence?)?)*
        # BLOCK-END
        token = self.scanner.get()
        start_marker = token.start_marker
        while self.scanner.check(KeyToken, ValueToken):
            if self.scanner.check(KeyToken):
                token = self.scanner.get()
                if not self.scanner.check(KeyToken, ValueToken, BlockEndToken):
                    for event in self.parse_block_node_or_indentless_sequence():
                        yield event
                else:
                    yield self.process_empty_scalar(token.end_marker)
            if self.scanner.check(ValueToken):
                token = self.scanner.get()
                if not self.scanner.check(KeyToken, ValueToken, BlockEndToken):
                    for event in self.parse_block_node_or_indentless_sequence():
                        yield event
                else:
                    yield self.process_empty_scalar(token.end_marker)
            else:
                # A key without a value: emit an empty scalar for the value.
                token = self.scanner.peek()
                yield self.process_empty_scalar(token.start_marker)
        if not self.scanner.check(BlockEndToken):
            token = self.scanner.peek()
            raise ParserError("while scanning a block mapping", start_marker,
                    "expected <block end>, but found %r" % token.id, token.start_marker)
        token = self.scanner.get()
        yield CollectionEndEvent(token.start_marker, token.end_marker)

    def parse_flow_sequence(self):
        # flow_sequence     ::= FLOW-SEQUENCE-START
        #                       (flow_sequence_entry FLOW-ENTRY)*
        #                       flow_sequence_entry?
        #                       FLOW-SEQUENCE-END
        # flow_sequence_entry   ::= flow_node | KEY flow_node? (VALUE flow_node?)?
        #
        # Note that while production rules for both flow_sequence_entry and
        # flow_mapping_entry are equal, their interpretations are different.
        # For `flow_sequence_entry`, the part `KEY flow_node? (VALUE flow_node?)?`
        # generate an inline mapping (set syntax).
        token = self.scanner.get()
        start_marker = token.start_marker
        while not self.scanner.check(FlowSequenceEndToken):
            if self.scanner.check(KeyToken):
                # A '?' entry: a single-pair mapping nested in the sequence.
                token = self.scanner.get()
                yield MappingEvent(None, u'!',
                        token.start_marker, token.end_marker)
                if not self.scanner.check(ValueToken,
                        FlowEntryToken, FlowSequenceEndToken):
                    for event in self.parse_flow_node():
                        yield event
                else:
                    yield self.process_empty_scalar(token.end_marker)
                if self.scanner.check(ValueToken):
                    token = self.scanner.get()
                    if not self.scanner.check(FlowEntryToken, FlowSequenceEndToken):
                        for event in self.parse_flow_node():
                            yield event
                    else:
                        yield self.process_empty_scalar(token.end_marker)
                else:
                    token = self.scanner.peek()
                    yield self.process_empty_scalar(token.start_marker)
                token = self.scanner.peek()
                yield CollectionEndEvent(token.start_marker, token.start_marker)
            else:
                for event in self.parse_flow_node():
                    yield event
            if not self.scanner.check(FlowEntryToken, FlowSequenceEndToken):
                token = self.scanner.peek()
                raise ParserError("while scanning a flow sequence", start_marker,
                        "expected ',' or ']', but got %r" % token.id, token.start_marker)
            if self.scanner.check(FlowEntryToken):
                self.scanner.get()
        if not self.scanner.check(FlowSequenceEndToken):
            token = self.scanner.peek()
            raise ParserError("while scanning a flow sequence", start_marker,
                    "expected ']', but found %r" % token.id, token.start_marker)
        token = self.scanner.get()
        yield CollectionEndEvent(token.start_marker, token.end_marker)

    def parse_flow_mapping(self):
        # flow_mapping      ::= FLOW-MAPPING-START
        #                       (flow_mapping_entry FLOW-ENTRY)*
        #                       flow_mapping_entry?
        #                       FLOW-MAPPING-END
        # flow_mapping_entry    ::= flow_node | KEY flow_node? (VALUE flow_node?)?
        token = self.scanner.get()
        start_marker = token.start_marker
        while not self.scanner.check(FlowMappingEndToken):
            if self.scanner.check(KeyToken):
                token = self.scanner.get()
                if not self.scanner.check(ValueToken,
                        FlowEntryToken, FlowMappingEndToken):
                    for event in self.parse_flow_node():
                        yield event
                else:
                    yield self.process_empty_scalar(token.end_marker)
                if self.scanner.check(ValueToken):
                    token = self.scanner.get()
                    if not self.scanner.check(FlowEntryToken, FlowMappingEndToken):
                        for event in self.parse_flow_node():
                            yield event
                    else:
                        yield self.process_empty_scalar(token.end_marker)
                else:
                    token = self.scanner.peek()
                    yield self.process_empty_scalar(token.start_marker)
            else:
                # A bare node acts as a key with an empty value.
                for event in self.parse_flow_node():
                    yield event
                yield self.process_empty_scalar(self.scanner.peek().start_marker)
            if not self.scanner.check(FlowEntryToken, FlowMappingEndToken):
                token = self.scanner.peek()
                raise ParserError("while scanning a flow mapping", start_marker,
                        "expected ',' or '}', but got %r" % token.id, token.start_marker)
            if self.scanner.check(FlowEntryToken):
                self.scanner.get()
        if not self.scanner.check(FlowMappingEndToken):
            token = self.scanner.peek()
            raise ParserError("while scanning a flow mapping", start_marker,
                    "expected '}', but found %r" % token.id, token.start_marker)
        token = self.scanner.get()
        yield CollectionEndEvent(token.start_marker, token.end_marker)

    def process_empty_scalar(self, marker):
        # Build a zero-width empty scalar event at the given position.
        return ScalarEvent(None, None, u'', marker, marker)
[43]449
Note: See TracBrowser for help on using the repository browser.