Changeset 45 for branches/pyyaml3000/lib/yaml/stream.py
- Timestamp:
- 02/15/06 16:57:37 (7 years ago)
- File:
-
- 1 edited
-
branches/pyyaml3000/lib/yaml/stream.py (modified) (1 diff)
Legend:
- Unmodified
- Added
- Removed
-
branches/pyyaml3000/lib/yaml/stream.py
r44 r45 1 2 from marker import Marker 1 # This module contains abstractions for the input stream. You don't have to 2 # looks further, there are no pretty code. 3 # 4 # We define two classes here. 5 # 6 # Marker(source, line, column) 7 # It's just a record and its only use is producing nice error messages. 8 # Parser does not use it for any other purposes. 9 # 10 # Stream(source, data) 11 # Stream determines the encoding of `data` and converts it to unicode. 12 # Stream provides the following methods and attributes: 13 # stream.peek(length=1) - return the next `length` characters 14 # stream.forward(length=1) - move the current position to `length` characters. 15 # stream.index - the number of the current character. 16 # stream.line, stream.column - the line and the column of the current character. 17 18 19 from error import YAMLError 20 21 import codecs, re 22 23 # Unfortunately, codec functions in Python 2.3 does not support the `finish` 24 # arguments, so we have to write our own wrappers. 25 26 try: 27 codecs.utf_8_decode('', 'strict', False) 28 from codecs import utf_8_decode, utf_16_le_decode, utf_16_be_decode 29 30 except TypeError: 31 32 def utf_16_le_decode(data, errors, finish=False): 33 if not finish and len(data) % 2 == 1: 34 data = data[:-1] 35 return codecs.utf_16_le_decode(data, errors) 36 37 def utf_16_be_decode(data, errors, finish=False): 38 if not finish and len(data) % 2 == 1: 39 data = data[:-1] 40 return codecs.utf_16_be_decode(data, errors) 41 42 def utf_8_decode(data, errors, finish=False): 43 if not finish: 44 # We are trying to remove a possible incomplete multibyte character 45 # from the suffix of the data. 46 # The first byte of a multi-byte sequence is in the range 0xc0 to 0xfd. 47 # All further bytes are in the range 0x80 to 0xbf. 48 # UTF-8 encoded UCS characters may be up to six bytes long. 49 count = 0 50 while count < 5 and count < len(data) \ 51 and '\x80' <= data[-count-1] <= '\xBF': 52 count -= 1 53 if count < 5 and count < len(data) \ 54 and '\xC0' <= data[-count-1] <= '\xFD': 55 data = data[:-count-1] 56 return codecs.utf_8_decode(data, errors) 57 58 class Marker: 59 60 def __init__(self, source, line, column, buffer, pointer): 61 self.source = source 62 self.line = line 63 self.column = column 64 self.buffer = buffer 65 self.pointer = pointer 66 67 def get_snippet(self, max_length=79): 68 if self.buffer is None: 69 return None 70 head = '' 71 start = self.pointer 72 while start > 0 and self.buffer[start-1] not in u'\0\r\n\x85\u2028\u2029': 73 start -= 1 74 if self.pointer-start > max_length/2-1: 75 head = ' ... ' 76 start += 5 77 break 78 tail = '' 79 end = self.pointer 80 while end < len(self.buffer) and self.buffer[end] not in u'\0\r\n\x85\u2028\u2029': 81 end += 1 82 if end-self.pointer > max_length/2-1: 83 tail = ' ... ' 84 end -= 5 85 break 86 snippet = self.buffer[start:end].encode('utf-8') 87 return head + snippet + tail + '\n' \ 88 + ' '*(self.pointer-start+len(head)) + '^' + '\n' 89 90 class StreamError(YAMLError): 91 92 def __init__(self, source, encoding, character, position, reason): 93 self.source = source 94 self.encoding = encoding 95 self.character = character 96 self.position = position 97 self.reason = reason 98 99 def __str__(self): 100 if isinstance(self.character, str): 101 return "'%s' codec can't decode byte #x%02x: %s\n" \ 102 "\tin file '%s', position %d." \ 103 % (self.encoding, ord(self.character), self.reason, 104 self.source, self.position) 105 else: 106 return "unacceptable character #x%04x: %s\n" \ 107 "\tin file '%s', position %d." \ 108 % (ord(self.character), self.reason, 109 self.source, self.position) 3 110 4 111 class Stream: 112 # Stream: 113 # - determines the data encoding and converts it to unicode, 114 # - checks if characters are in allowed range, 115 # - adds '\0' to the end. 116 117 # Yeah, it's ugly and slow. 5 118 6 119 def __init__(self, source, data): 7 120 self.source = source 8 self.data = unicode(data, 'utf-8')+u'\0' 121 self.stream = None 122 self.stream_pointer = 0 123 self.eof = True 124 self.buffer = u'' 125 self.pointer = 0 126 self.raw_buffer = None 127 self.raw_decoder = None 9 128 self.index = 0 10 129 self.line = 0 11 130 self.column = 0 12 13 def peek(self, k=1): 14 return self.data[self.index:self.index+k] 15 16 def read(self, k=1): 17 value = self.data[self.index:self.index+k] 18 for i in range(k): 19 if self.index >= len(self.data): 20 break 21 if self.data[self.index] in u'\r\n\x85\u2028\u2029': 131 if isinstance(data, unicode): 132 self.check_printable(data) 133 self.buffer = data+u'\0' 134 elif isinstance(data, str): 135 self.raw_buffer = data 136 self.determine_encoding() 137 else: 138 self.stream = data 139 self.eof = False 140 self.raw_buffer = '' 141 self.determine_encoding() 142 143 def peek(self, length=1): 144 if self.pointer+length >= len(self.buffer): 145 self.update(length) 146 return self.buffer[self.pointer:self.pointer+length] 147 148 def forward(self, length=1): 149 if self.pointer+length+1 >= len(self.buffer): 150 self.update(length+1) 151 for k in range(length): 152 ch = self.buffer[self.pointer] 153 self.pointer += 1 154 self.index += 1 155 if ch in u'\n\x85\u2028\u2029' \ 156 or (ch == u'\r' and self.buffer[self.pointer+1] != u'\n'): 22 157 self.line += 1 23 158 self.column = 0 24 el se:159 elif ch != u'\uFEFF': 25 160 self.column += 1 26 self.index += 127 return value28 161 29 162 def get_marker(self): 30 return Marker(self.source, self.data, self.index, self.line, self.column) 31 163 if self.stream is None: 164 return Marker(self.source, self.line, self.column, 165 self.buffer, self.pointer) 166 else: 167 return Marker(self.source, self.line, self.column, None, None) 168 169 def determine_encoding(self): 170 while not self.eof and len(self.raw_buffer) < 2: 171 self.update_raw() 172 if self.raw_buffer.startswith(codecs.BOM_UTF16_LE): 173 self.raw_decode = utf_16_le_decode 174 elif self.raw_buffer.startswith(codecs.BOM_UTF16_BE): 175 self.raw_decode = utf_16_be_decode 176 else: 177 self.raw_decode = utf_8_decode 178 self.update(1) 179 180 NON_PRINTABLE = re.compile(u'[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD]') 181 def check_printable(self, data): 182 match = self.NON_PRINTABLE.search(data) 183 if match: 184 character = match.group() 185 position = self.index+(len(self.buffer)-self.pointer)+match.start() 186 raise StreamError(self.source, 'unicode', character, position, 187 "control characters are not allowed") 188 189 def update(self, length): 190 if self.raw_buffer is None: 191 return 192 self.buffer = self.buffer[self.pointer:] 193 self.pointer = 0 194 while len(self.buffer) < length: 195 if not self.eof: 196 self.update_raw() 197 try: 198 data, converted = self.raw_decode(self.raw_buffer, 199 'strict', self.eof) 200 except UnicodeDecodeError, exc: 201 character = exc.object[exc.start] 202 if self.stream is not None: 203 position = self.stream_pointer-len(self.raw_buffer)+exc.start 204 else: 205 position = exc.start 206 raise StreamError(self.source, exc.encoding, 207 character, position, exc.reason) 208 self.check_printable(data) 209 self.buffer += data 210 self.raw_buffer = self.raw_buffer[converted:] 211 if self.eof: 212 self.buffer += u'\0' 213 self.raw_buffer = None 214 break 215 216 def update_raw(self, size=1024): 217 data = self.stream.read(size) 218 if data: 219 self.raw_buffer += data 220 self.stream_pointer += len(data) 221 else: 222 self.eof = True 223 224 #try: 225 # import psyco 226 # psyco.bind(Stream) 227 #except ImportError: 228 # pass 229
Note: See TracChangeset
for help on using the changeset viewer.
