| [45] | 1 | # This module contains abstractions for the input stream. You don't have to |
|---|
| 2 | # look further, there is no pretty code. |
|---|
| 3 | # |
|---|
| 4 | # We define two classes here. |
|---|
| 5 | # |
|---|
| [116] | 6 | # Mark(name, index, line, column, buffer, pointer) |
|---|
| [45] | 7 | # It's just a record and its only use is producing nice error messages. |
|---|
| 8 | # Parser does not use it for any other purposes. |
|---|
| 9 | # |
|---|
| [46] | 10 | # Reader(stream) |
|---|
| 11 | # Reader determines the encoding of `stream` and converts it to unicode. |
|---|
| 12 | # Reader provides the following methods and attributes: |
|---|
| 13 | # reader.peek(index=0) - return the character `index` positions ahead of the current one; reader.prefix(length=1) - return the next `length` characters. |
|---|
| 14 | # reader.forward(length=1) - move the current position forward by `length` characters. |
|---|
| 15 | # reader.index - the number of the current character. |
|---|
| 16 | # reader.line, reader.column - the line and the column of the current character. |
|---|
| [43] | 17 | |
|---|
# Names exported by a star-import of this module.
__all__ = ['Reader', 'ReaderError']
|---|
| [43] | 19 | |
|---|
| [116] | 20 | from error import YAMLError, Mark |
|---|
| [45] | 21 | |
|---|
| 22 | import codecs, re |
|---|
| 23 | |
|---|
class ReaderError(YAMLError):
    # Error raised by Reader when the input cannot be decoded or contains
    # a character outside the accepted range.
    #
    # Attributes:
    #   name      - name of the input source, used in the message.
    #   position  - offset of the offending byte/character in the input.
    #   character - either a `str` holding the undecodable byte, or an
    #               integer code point of the unacceptable character.
    #   encoding  - name of the codec that was in use.
    #   reason    - short explanation of the failure.

    def __init__(self, name, position, character, encoding, reason):
        self.name = name
        self.position = position
        self.character = character
        self.encoding = encoding
        self.reason = reason

    def __str__(self):
        # A `str` character means a raw byte that failed to decode; anything
        # else is treated as the numeric code of a rejected character.
        if isinstance(self.character, str):
            template = ("'%s' codec can't decode byte #x%02x: %s\n"
                        " in \"%s\", position %d")
            values = (self.encoding, ord(self.character), self.reason,
                      self.name, self.position)
        else:
            template = ("unacceptable character #x%04x: %s\n"
                        " in \"%s\", position %d")
            values = (self.character, self.reason,
                      self.name, self.position)
        return template % values
|---|
| [45] | 44 | |
|---|
| [222] | 45 | class Reader(object): |
|---|
| [46] | 46 | # Reader: |
|---|
| [45] | 47 | # - determines the data encoding and converts it to unicode, |
|---|
| 48 | # - checks if characters are in allowed range, |
|---|
| 49 | # - adds '\0' to the end. |
|---|
| [43] | 50 | |
|---|
| [46] | 51 | # Reader accepts |
|---|
| 52 | # - a `str` object, |
|---|
| 53 | # - a `unicode` object, |
|---|
| 54 | # - a file-like object with its `read` method returning `str`, |
|---|
| 55 | # - a file-like object with its `read` method returning `unicode`. |
|---|
| 56 | |
|---|
| [45] | 57 | # Yeah, it's ugly and slow. |
|---|
| 58 | |
|---|
| [136] | 59 | def __init__(self, stream): |
|---|
| [46] | 60 | self.name = None |
|---|
| [45] | 61 | self.stream = None |
|---|
| 62 | self.stream_pointer = 0 |
|---|
| 63 | self.eof = True |
|---|
| 64 | self.buffer = u'' |
|---|
| 65 | self.pointer = 0 |
|---|
| 66 | self.raw_buffer = None |
|---|
| [46] | 67 | self.raw_decode = None |
|---|
| [130] | 68 | self.encoding = None |
|---|
| [43] | 69 | self.index = 0 |
|---|
| 70 | self.line = 0 |
|---|
| 71 | self.column = 0 |
|---|
| [136] | 72 | if isinstance(stream, unicode): |
|---|
| [46] | 73 | self.name = "<unicode string>" |
|---|
| [136] | 74 | self.check_printable(stream) |
|---|
| 75 | self.buffer = stream+u'\0' |
|---|
| 76 | elif isinstance(stream, str): |
|---|
| [46] | 77 | self.name = "<string>" |
|---|
| [136] | 78 | self.raw_buffer = stream |
|---|
| [45] | 79 | self.determine_encoding() |
|---|
| 80 | else: |
|---|
| [136] | 81 | self.stream = stream |
|---|
| 82 | self.name = getattr(stream, 'name', "<file>") |
|---|
| [45] | 83 | self.eof = False |
|---|
| 84 | self.raw_buffer = '' |
|---|
| 85 | self.determine_encoding() |
|---|
| [43] | 86 | |
|---|
| [48] | 87 | def peek(self, index=0): |
|---|
| [173] | 88 | try: |
|---|
| 89 | return self.buffer[self.pointer+index] |
|---|
| 90 | except IndexError: |
|---|
| [48] | 91 | self.update(index+1) |
|---|
| [173] | 92 | return self.buffer[self.pointer+index] |
|---|
| [48] | 93 | |
|---|
| 94 | def prefix(self, length=1): |
|---|
| [45] | 95 | if self.pointer+length >= len(self.buffer): |
|---|
| 96 | self.update(length) |
|---|
| 97 | return self.buffer[self.pointer:self.pointer+length] |
|---|
| [43] | 98 | |
|---|
| [45] | 99 | def forward(self, length=1): |
|---|
| 100 | if self.pointer+length+1 >= len(self.buffer): |
|---|
| 101 | self.update(length+1) |
|---|
| [173] | 102 | while length: |
|---|
| [45] | 103 | ch = self.buffer[self.pointer] |
|---|
| 104 | self.pointer += 1 |
|---|
| 105 | self.index += 1 |
|---|
| 106 | if ch in u'\n\x85\u2028\u2029' \ |
|---|
| [188] | 107 | or (ch == u'\r' and self.buffer[self.pointer] != u'\n'): |
|---|
| [43] | 108 | self.line += 1 |
|---|
| 109 | self.column = 0 |
|---|
| [45] | 110 | elif ch != u'\uFEFF': |
|---|
| [43] | 111 | self.column += 1 |
|---|
| [173] | 112 | length -= 1 |
|---|
| [43] | 113 | |
|---|
| [116] | 114 | def get_mark(self): |
|---|
| [45] | 115 | if self.stream is None: |
|---|
| [119] | 116 | return Mark(self.name, self.index, self.line, self.column, |
|---|
| [45] | 117 | self.buffer, self.pointer) |
|---|
| 118 | else: |
|---|
| [119] | 119 | return Mark(self.name, self.index, self.line, self.column, |
|---|
| 120 | None, None) |
|---|
| [43] | 121 | |
|---|
| [45] | 122 | def determine_encoding(self): |
|---|
| 123 | while not self.eof and len(self.raw_buffer) < 2: |
|---|
| 124 | self.update_raw() |
|---|
| [46] | 125 | if not isinstance(self.raw_buffer, unicode): |
|---|
| 126 | if self.raw_buffer.startswith(codecs.BOM_UTF16_LE): |
|---|
| [380] | 127 | self.raw_decode = codecs.utf_16_le_decode |
|---|
| [130] | 128 | self.encoding = 'utf-16-le' |
|---|
| [46] | 129 | elif self.raw_buffer.startswith(codecs.BOM_UTF16_BE): |
|---|
| [380] | 130 | self.raw_decode = codecs.utf_16_be_decode |
|---|
| [130] | 131 | self.encoding = 'utf-16-be' |
|---|
| [46] | 132 | else: |
|---|
| [380] | 133 | self.raw_decode = codecs.utf_8_decode |
|---|
| [130] | 134 | self.encoding = 'utf-8' |
|---|
| [45] | 135 | self.update(1) |
|---|
| 136 | |
|---|
| 137 | NON_PRINTABLE = re.compile(u'[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD]') |
|---|
| 138 | def check_printable(self, data): |
|---|
| 139 | match = self.NON_PRINTABLE.search(data) |
|---|
| 140 | if match: |
|---|
| 141 | character = match.group() |
|---|
| 142 | position = self.index+(len(self.buffer)-self.pointer)+match.start() |
|---|
| [323] | 143 | raise ReaderError(self.name, position, ord(character), |
|---|
| [46] | 144 | 'unicode', "special characters are not allowed") |
|---|
| [45] | 145 | |
|---|
| 146 | def update(self, length): |
|---|
| 147 | if self.raw_buffer is None: |
|---|
| 148 | return |
|---|
| 149 | self.buffer = self.buffer[self.pointer:] |
|---|
| 150 | self.pointer = 0 |
|---|
| 151 | while len(self.buffer) < length: |
|---|
| 152 | if not self.eof: |
|---|
| 153 | self.update_raw() |
|---|
| [46] | 154 | if self.raw_decode is not None: |
|---|
| 155 | try: |
|---|
| 156 | data, converted = self.raw_decode(self.raw_buffer, |
|---|
| 157 | 'strict', self.eof) |
|---|
| 158 | except UnicodeDecodeError, exc: |
|---|
| 159 | character = exc.object[exc.start] |
|---|
| 160 | if self.stream is not None: |
|---|
| 161 | position = self.stream_pointer-len(self.raw_buffer)+exc.start |
|---|
| 162 | else: |
|---|
| 163 | position = exc.start |
|---|
| 164 | raise ReaderError(self.name, position, character, |
|---|
| 165 | exc.encoding, exc.reason) |
|---|
| 166 | else: |
|---|
| 167 | data = self.raw_buffer |
|---|
| 168 | converted = len(data) |
|---|
| [45] | 169 | self.check_printable(data) |
|---|
| 170 | self.buffer += data |
|---|
| 171 | self.raw_buffer = self.raw_buffer[converted:] |
|---|
| 172 | if self.eof: |
|---|
| 173 | self.buffer += u'\0' |
|---|
| 174 | self.raw_buffer = None |
|---|
| 175 | break |
|---|
| 176 | |
|---|
| 177 | def update_raw(self, size=1024): |
|---|
| 178 | data = self.stream.read(size) |
|---|
| 179 | if data: |
|---|
| 180 | self.raw_buffer += data |
|---|
| 181 | self.stream_pointer += len(data) |
|---|
| 182 | else: |
|---|
| 183 | self.eof = True |
|---|
| 184 | |
|---|
| 185 | #try: |
|---|
| 186 | # import psyco |
|---|
| [46] | 187 | # psyco.bind(Reader) |
|---|
| [45] | 188 | #except ImportError: |
|---|
| 189 | # pass |
|---|
| 190 | |
|---|