source: pyyaml/trunk/lib/yaml/reader.py @ 222

Revision 222, 7.9 KB checked in by xi, 8 years ago (diff)

Subclass all base classes from object.

Hold references to the objects being represented (should fix #22).

The value of a mapping node is represented as a list of pairs (key, value)
now.

Sort dictionary items (fix #23).

Recursive structures are now loaded and dumped correctly, including complex
structures like recursive tuples (fix #5). Thanks Peter Murphy for the patches.
To make it possible, representer functions are allowed to be generators.
In this case, the first generated value is an object. Other values produced
by the representer are ignored.

Make Representer not try to guess !!pairs when a list is represented.
You need to construct a !!pairs node explicitly now.

Do not check for duplicate mapping keys as it didn't work correctly anyway.

RevLine 
[45]1# This module contains abstractions for the input stream. You don't have to
2# look any further; there is no pretty code here.
3#
4# We define two classes here.
5#
[116]6#   Mark(source, line, column)
[45]7# It's just a record and its only use is producing nice error messages.
8# Parser does not use it for any other purposes.
9#
[46]10#   Reader(stream)
11# Reader determines the encoding of the data in `stream` and converts it to unicode.
12# Reader provides the following methods and attributes:
13#   reader.peek(index=0) - return the character `index` positions after the current one.
14#   reader.forward(length=1) - move the current position forward by `length` characters.
15#   reader.index - the number of the current character.
16#   reader.line, reader.column - the line and the column of the current character.
[43]17
[52]18__all__ = ['Reader', 'ReaderError']
[43]19
[116]20from error import YAMLError, Mark
[45]21
22import codecs, re
23
24# Unfortunately, the codec functions in Python 2.3 do not support the `finish`
25# argument, so we have to write our own wrappers.
26
try:
    # Probe for the three-argument form of the codec functions.  A TypeError
    # means it is not accepted, and the wrappers below are used instead.
    codecs.utf_8_decode('', 'strict', False)
    from codecs import utf_8_decode, utf_16_le_decode, utf_16_be_decode

except TypeError:

    def utf_16_le_decode(data, errors, finish=False):
        """Decode UTF-16-LE `data`; return (text, number of bytes consumed).

        Unless `finish` is true, a trailing odd byte (half of a code unit)
        is left unconsumed so that it can be completed by later input.
        """
        if not finish and len(data) % 2 == 1:
            data = data[:-1]
        return codecs.utf_16_le_decode(data, errors)

    def utf_16_be_decode(data, errors, finish=False):
        """Decode UTF-16-BE `data`; return (text, number of bytes consumed).

        Unless `finish` is true, a trailing odd byte (half of a code unit)
        is left unconsumed so that it can be completed by later input.
        """
        if not finish and len(data) % 2 == 1:
            data = data[:-1]
        return codecs.utf_16_be_decode(data, errors)

    def utf_8_decode(data, errors, finish=False):
        """Decode UTF-8 `data`; return (text, number of bytes consumed).

        Unless `finish` is true, a multibyte sequence at the very end of
        `data` is left unconsumed because it may be incomplete.  A complete
        trailing multibyte character is held back as well; it is simply not
        counted as consumed.
        """
        if not finish:
            # We are trying to remove a possible incomplete multibyte character
            # from the suffix of the data.
            # The first byte of a multi-byte sequence is in the range 0xc0 to 0xfd.
            # All further bytes are in the range 0x80 to 0xbf.
            # UTF-8 encoded UCS characters may be up to six bytes long.
            size = len(data)
            # Number of trailing continuation bytes found so far.  It has to
            # grow: decrementing it would index forward from the start of
            # the data instead of backward from its end.
            count = 0
            # A one-byte slice (not an index) is taken so that ord() always
            # receives a string of length one.
            while count < 5 and count < size    \
                    and 0x80 <= ord(data[size-count-1:size-count]) <= 0xBF:
                count += 1
            if count < 5 and count < size   \
                    and 0xC0 <= ord(data[size-count-1:size-count]) <= 0xFD:
                # Drop the lead byte together with its continuation bytes.
                data = data[:size-count-1]
        return codecs.utf_8_decode(data, errors)
58
class ReaderError(YAMLError):
    """Error describing a byte or character the reader could not accept.

    Attributes:
        name      - the name of the input source.
        position  - the position of the offending byte or character.
        character - the offending item; a `str` (a raw byte) when decoding
                    failed, anything else is reported as an unacceptable
                    character.
        encoding  - the name of the codec in use.
        reason    - a human-readable explanation.
    """

    def __init__(self, name, position, character, encoding, reason):
        self.name = name
        self.position = position
        self.character = character
        self.encoding = encoding
        self.reason = reason

    def __str__(self):
        code = ord(self.character)
        if not isinstance(self.character, str):
            # Not a raw byte: the character itself is not allowed.
            template = "unacceptable character #x%04x: %s\n"    \
                    "  in \"%s\", position %d"
            return template % (code, self.reason, self.name, self.position)
        # A raw byte: the codec could not decode it.
        template = "'%s' codec can't decode byte #x%02x: %s\n"  \
                "  in \"%s\", position %d"
        return template % (self.encoding, code, self.reason,
                self.name, self.position)
[45]79
[222]80class Reader(object):
[46]81    # Reader:
[45]82    # - determines the data encoding and converts it to unicode,
83    # - checks if characters are in allowed range,
84    # - adds '\0' to the end.
[43]85
[46]86    # Reader accepts
87    #  - a `str` object,
88    #  - a `unicode` object,
89    #  - a file-like object with its `read` method returning `str`,
90    #  - a file-like object with its `read` method returning `unicode`.
91
[45]92    # Yeah, it's ugly and slow.
93
[136]94    def __init__(self, stream):
[46]95        self.name = None
[45]96        self.stream = None
97        self.stream_pointer = 0
98        self.eof = True
99        self.buffer = u''
100        self.pointer = 0
101        self.raw_buffer = None
[46]102        self.raw_decode = None
[130]103        self.encoding = None
[43]104        self.index = 0
105        self.line = 0
106        self.column = 0
[136]107        if isinstance(stream, unicode):
[46]108            self.name = "<unicode string>"
[136]109            self.check_printable(stream)
110            self.buffer = stream+u'\0'
111        elif isinstance(stream, str):
[46]112            self.name = "<string>"
[136]113            self.raw_buffer = stream
[45]114            self.determine_encoding()
115        else:
[136]116            self.stream = stream
117            self.name = getattr(stream, 'name', "<file>")
[45]118            self.eof = False
119            self.raw_buffer = ''
120            self.determine_encoding()
[43]121
[48]122    def peek(self, index=0):
[173]123        try:
124            return self.buffer[self.pointer+index]
125        except IndexError:
[48]126            self.update(index+1)
[173]127            return self.buffer[self.pointer+index]
[48]128
129    def prefix(self, length=1):
[45]130        if self.pointer+length >= len(self.buffer):
131            self.update(length)
132        return self.buffer[self.pointer:self.pointer+length]
[43]133
[45]134    def forward(self, length=1):
135        if self.pointer+length+1 >= len(self.buffer):
136            self.update(length+1)
[173]137        while length:
[45]138            ch = self.buffer[self.pointer]
139            self.pointer += 1
140            self.index += 1
141            if ch in u'\n\x85\u2028\u2029'  \
[188]142                    or (ch == u'\r' and self.buffer[self.pointer] != u'\n'):
[43]143                self.line += 1
144                self.column = 0
[45]145            elif ch != u'\uFEFF':
[43]146                self.column += 1
[173]147            length -= 1
[43]148
[116]149    def get_mark(self):
[45]150        if self.stream is None:
[119]151            return Mark(self.name, self.index, self.line, self.column,
[45]152                    self.buffer, self.pointer)
153        else:
[119]154            return Mark(self.name, self.index, self.line, self.column,
155                    None, None)
[43]156
[45]157    def determine_encoding(self):
158        while not self.eof and len(self.raw_buffer) < 2:
159            self.update_raw()
[46]160        if not isinstance(self.raw_buffer, unicode):
161            if self.raw_buffer.startswith(codecs.BOM_UTF16_LE):
162                self.raw_decode = utf_16_le_decode
[130]163                self.encoding = 'utf-16-le'
[46]164            elif self.raw_buffer.startswith(codecs.BOM_UTF16_BE):
165                self.raw_decode = utf_16_be_decode
[130]166                self.encoding = 'utf-16-be'
[46]167            else:
168                self.raw_decode = utf_8_decode
[130]169                self.encoding = 'utf-8'
[45]170        self.update(1)
171
172    NON_PRINTABLE = re.compile(u'[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD]')
173    def check_printable(self, data):
174        match = self.NON_PRINTABLE.search(data)
175        if match:
176            character = match.group()
177            position = self.index+(len(self.buffer)-self.pointer)+match.start()
[46]178            raise ReaderError(self.name, position, character,
179                    'unicode', "special characters are not allowed")
[45]180
181    def update(self, length):
182        if self.raw_buffer is None:
183            return
184        self.buffer = self.buffer[self.pointer:]
185        self.pointer = 0
186        while len(self.buffer) < length:
187            if not self.eof:
188                self.update_raw()
[46]189            if self.raw_decode is not None:
190                try:
191                    data, converted = self.raw_decode(self.raw_buffer,
192                            'strict', self.eof)
193                except UnicodeDecodeError, exc:
194                    character = exc.object[exc.start]
195                    if self.stream is not None:
196                        position = self.stream_pointer-len(self.raw_buffer)+exc.start
197                    else:
198                        position = exc.start
199                    raise ReaderError(self.name, position, character,
200                            exc.encoding, exc.reason)
201            else:
202                data = self.raw_buffer
203                converted = len(data)
[45]204            self.check_printable(data)
205            self.buffer += data
206            self.raw_buffer = self.raw_buffer[converted:]
207            if self.eof:
208                self.buffer += u'\0'
209                self.raw_buffer = None
210                break
211
212    def update_raw(self, size=1024):
213        data = self.stream.read(size)
214        if data:
215            self.raw_buffer += data
216            self.stream_pointer += len(data)
217        else:
218            self.eof = True
219
220#try:
221#    import psyco
[46]222#    psyco.bind(Reader)
[45]223#except ImportError:
224#    pass
225
Note: See TracBrowser for help on using the repository browser.