Changeset 328 for pyyaml/trunk/lib3/yaml/reader.py
- Timestamp:
- 12/29/08 12:24:05 (4 years ago)
- Location:
- pyyaml/trunk/lib3
- Files:
-
- 1 edited
- 1 copied
-
. (copied) (copied from pyyaml/trunk/lib)
-
yaml/reader.py (modified) (10 diffs)
Legend:
- Unmodified
- Added
- Removed
-
pyyaml/trunk/lib3/yaml/reader.py
r323 r328
  18   18    __all__ = ['Reader', 'ReaderError']
  19   19
  20       - from error import YAMLError, Mark
       20  + from .error import YAMLError, Mark
  21   21
  22   22    import codecs, re
  23       -
  24       - # Unfortunately, codec functions in Python 2.3 does not support the `finish`
  25       - # arguments, so we have to write our own wrappers.
  26       -
  27       - try:
  28       -     codecs.utf_8_decode('', 'strict', False)
  29       -     from codecs import utf_8_decode, utf_16_le_decode, utf_16_be_decode
  30       -
  31       - except TypeError:
  32       -
  33       -     def utf_16_le_decode(data, errors, finish=False):
  34       -         if not finish and len(data) % 2 == 1:
  35       -             data = data[:-1]
  36       -         return codecs.utf_16_le_decode(data, errors)
  37       -
  38       -     def utf_16_be_decode(data, errors, finish=False):
  39       -         if not finish and len(data) % 2 == 1:
  40       -             data = data[:-1]
  41       -         return codecs.utf_16_be_decode(data, errors)
  42       -
  43       -     def utf_8_decode(data, errors, finish=False):
  44       -         if not finish:
  45       -             # We are trying to remove a possible incomplete multibyte character
  46       -             # from the suffix of the data.
  47       -             # The first byte of a multi-byte sequence is in the range 0xc0 to 0xfd.
  48       -             # All further bytes are in the range 0x80 to 0xbf.
  49       -             # UTF-8 encoded UCS characters may be up to six bytes long.
  50       -             count = 0
  51       -             while count < 5 and count < len(data) \
  52       -                     and '\x80' <= data[-count-1] <= '\xBF':
  53       -                 count -= 1
  54       -             if count < 5 and count < len(data) \
  55       -                     and '\xC0' <= data[-count-1] <= '\xFD':
  56       -                 data = data[:-count-1]
  57       -         return codecs.utf_8_decode(data, errors)
  58   23
  59   24    class ReaderError(YAMLError):
   …    …
  67   32
  68   33        def __str__(self):
  69       -         if isinstance(self.character, str):
       34  +         if isinstance(self.character, bytes):
  70   35                return "'%s' codec can't decode byte #x%02x: %s\n" \
  71   36                        " in \"%s\", position %d" \
   …    …
  80   45    class Reader(object):
  81   46        # Reader:
  82       -     # - determines the data encoding and converts it to unicode,
       47  +     # - determines the data encoding and converts it to a unicode string,
  83   48        # - checks if characters are in allowed range,
  84   49        # - adds '\0' to the end.
  85   50
  86   51        # Reader accepts
       52  +     # - a `bytes` object,
  87   53        # - a `str` object,
  88       -     # - a `unicode` object,
  89   54        # - a file-like object with its `read` method returning `str`,
  90   55        # - a file-like object with its `read` method returning `unicode`.
   …    …
  97   62            self.stream_pointer = 0
  98   63            self.eof = True
  99       -         self.buffer = u''
       64  +         self.buffer = ''
 100   65            self.pointer = 0
 101   66            self.raw_buffer = None
   …    …
 105   70            self.line = 0
 106   71            self.column = 0
 107       -         if isinstance(stream, unicode):
       72  +         if isinstance(stream, str):
 108   73                self.name = "<unicode string>"
 109   74                self.check_printable(stream)
 110       -             self.buffer = stream+u'\0'
 111       -         elif isinstance(stream, str):
 112       -             self.name = "<string>"
       75  +             self.buffer = stream+'\0'
       76  +         elif isinstance(stream, bytes):
       77  +             self.name = "<byte string>"
 113   78                self.raw_buffer = stream
 114   79                self.determine_encoding()
   …    …
 117   82                self.name = getattr(stream, 'name', "<file>")
 118   83                self.eof = False
 119       -             self.raw_buffer = ''
       84  +             self.raw_buffer = None
 120   85                self.determine_encoding()
 121   86
   …    …
 139  104                self.pointer += 1
 140  105                self.index += 1
 141       -             if ch in u'\n\x85\u2028\u2029' \
 142       -                     or (ch == u'\r' and self.buffer[self.pointer] != u'\n'):
      106  +             if ch in '\n\x85\u2028\u2029' \
      107  +                     or (ch == '\r' and self.buffer[self.pointer] != '\n'):
 143  108                    self.line += 1
 144  109                    self.column = 0
 145       -             elif ch != u'\uFEFF':
      110  +             elif ch != '\uFEFF':
 146  111                    self.column += 1
 147  112                length -= 1
   …    …
 156  121
 157  122        def determine_encoding(self):
 158       -         while not self.eof and len(self.raw_buffer) < 2:
      123  +         while not self.eof and (self.raw_buffer is None or len(self.raw_buffer) < 2):
 159  124                self.update_raw()
 160       -         if not isinstance(self.raw_buffer, unicode):
      125  +         if isinstance(self.raw_buffer, bytes):
 161  126                if self.raw_buffer.startswith(codecs.BOM_UTF16_LE):
 162       -                 self.raw_decode = utf_16_le_decode
      127  +                 self.raw_decode = codecs.utf_16_le_decode
 163  128                    self.encoding = 'utf-16-le'
 164  129                elif self.raw_buffer.startswith(codecs.BOM_UTF16_BE):
 165       -                 self.raw_decode = utf_16_be_decode
      130  +                 self.raw_decode = codecs.utf_16_be_decode
 166  131                    self.encoding = 'utf-16-be'
 167  132                else:
 168       -                 self.raw_decode = utf_8_decode
      133  +                 self.raw_decode = codecs.utf_8_decode
 169  134                    self.encoding = 'utf-8'
 170  135            self.update(1)
 171  136
 172       -     NON_PRINTABLE = re.compile(u'[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD]')
      137  +     NON_PRINTABLE = re.compile('[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD]')
 173  138        def check_printable(self, data):
 174  139            match = self.NON_PRINTABLE.search(data)
   …    …
 191  156                        data, converted = self.raw_decode(self.raw_buffer,
 192  157                                'strict', self.eof)
 193       -                 except UnicodeDecodeError, exc:
      158  +                 except UnicodeDecodeError as exc:
 194  159                        character = exc.object[exc.start]
 195  160                        if self.stream is not None:
   …    …
 206  171                self.raw_buffer = self.raw_buffer[converted:]
 207  172                if self.eof:
 208       -                 self.buffer += u'\0'
      173  +                 self.buffer += '\0'
 209  174                    self.raw_buffer = None
 210  175                    break
 211  176
 212       -     def update_raw(self, size=1024):
      177  +     def update_raw(self, size=4096):
 213  178            data = self.stream.read(size)
 214       -         if data:
      179  +         if self.raw_buffer is None:
      180  +             self.raw_buffer = data
      181  +         else:
 215  182                self.raw_buffer += data
 216       -             self.stream_pointer += len(data)
 217       -         else:
      183  +         self.stream_pointer += len(data)
      184  +         if not data:
 218  185                self.eof = True
 219  186
Note: See TracChangeset
for help on using the changeset viewer.
