Index: /branches/pyyaml3000/lib/yaml/error.py
===================================================================
--- /branches/pyyaml3000/lib/yaml/error.py	(revision 45)
+++ /branches/pyyaml3000/lib/yaml/error.py	(revision 45)
@@ -0,0 +1,4 @@
+
+class YAMLError(Exception):  # common base class; StreamError in stream.py derives from it
+    pass
+
Index: /branches/pyyaml3000/lib/yaml/scanner.py
===================================================================
--- /branches/pyyaml3000/lib/yaml/scanner.py	(revision 44)
+++ /branches/pyyaml3000/lib/yaml/scanner.py	(revision 45)
@@ -125,5 +125,5 @@
         # Stream supports the following methods
         #   self.stream.peek(k=1)   # peek the next k characters
-        #   self.stream.read(k=1)   # read the next k characters and move the
+        #   self.stream.forward(k=1)   # skip the next k characters and move the
         #                           # pointer
         self.stream = Stream(source, data)
@@ -443,5 +443,5 @@
         # Add DOCUMENT-START or DOCUMENT-END.
         start_marker = self.stream.get_marker()
-        self.stream.read(3)
+        self.stream.forward(3)
         end_marker = self.stream.get_marker()
         self.tokens.append(TokenClass(start_marker, end_marker))
@@ -466,5 +466,5 @@
         # Add FLOW-SEQUENCE-START or FLOW-MAPPING-START.
         start_marker = self.stream.get_marker()
-        self.stream.read()
+        self.stream.forward()
         end_marker = self.stream.get_marker()
         self.tokens.append(TokenClass(start_marker, end_marker))
@@ -489,5 +489,5 @@
         # Add FLOW-SEQUENCE-END or FLOW-MAPPING-END.
         start_marker = self.stream.get_marker()
-        self.stream.read()
+        self.stream.forward()
         end_marker = self.stream.get_marker()
         self.tokens.append(TokenClass(start_marker, end_marker))
@@ -515,5 +515,5 @@
         # Add ENTRY.
         start_marker = self.stream.get_marker()
-        self.stream.read()
+        self.stream.forward()
         end_marker = self.stream.get_marker()
         self.tokens.append(EntryToken(start_marker, end_marker))
@@ -541,5 +541,5 @@
         # Add KEY.
         start_marker = self.stream.get_marker()
-        self.stream.read()
+        self.stream.forward()
         end_marker = self.stream.get_marker()
         self.tokens.append(KeyToken(start_marker, end_marker))
@@ -577,5 +577,5 @@
         # Add VALUE.
         start_marker = self.stream.get_marker()
-        self.stream.read()
+        self.stream.forward()
         end_marker = self.stream.get_marker()
         self.tokens.append(ValueToken(start_marker, end_marker))
@@ -728,10 +728,10 @@
         while not found:
             while self.stream.peek() == u' ':
-                self.stream.read()
+                self.stream.forward()
             if self.stream.peek() == u'#':
                 while self.stream.peek() not in u'\r\n':
-                    self.stream.read()
+                    self.stream.forward()
             if self.stream.peek() in u'\r\n':
-                self.stream.read()
+                self.stream.forward()
                 if not self.flow_level:
                     self.allow_simple_key = True
@@ -748,11 +748,11 @@
             self.tokens.append(ReservedDirectiveToken('', marker, marker))
         while self.stream.peek() not in u'\0\r\n':
-            self.stream.read()
-        self.stream.read()
+            self.stream.forward()
+        self.stream.forward()
 
     def scan_anchor(self, TokenClass):
         start_marker = self.stream.get_marker()
         while self.stream.peek() not in u'\0 \t\r\n,:':
-            self.stream.read()
+            self.stream.forward()
         end_marker = self.stream.get_marker()
         self.tokens.append(TokenClass('', start_marker, end_marker))
@@ -761,5 +761,5 @@
         start_marker = self.stream.get_marker()
         while self.stream.peek() not in u'\0 \t\r\n':
-            self.stream.read()
+            self.stream.forward()
         end_marker = self.stream.get_marker()
         self.tokens.append(TagToken('', start_marker, end_marker))
@@ -772,10 +772,10 @@
         while True:
             while self.stream.peek() and self.stream.peek() and self.stream.peek() not in u'\0\r\n\x85\u2028\u2029':
-                self.stream.read()
+                self.stream.forward()
             if self.stream.peek() != u'\0':
-                self.stream.read()
+                self.stream.forward()
             count = 0
             while count < indent and self.stream.peek() == u' ':
-                self.stream.read()
+                self.stream.forward()
                 count += 1
             if count < indent and self.stream.peek() not in u'#\r\n\x85\u2028\u2029':
@@ -785,13 +785,14 @@
     def scan_flow_scalar(self, double):
         marker = self.stream.get_marker()
-        quote = self.stream.read()
+        quote = self.stream.peek()
+        self.stream.forward()
         while self.stream.peek() != quote:
             if double and self.stream.peek() == u'\\':
-                self.stream.read(2)
+                self.stream.forward(2)
             elif not double and self.stream.peek(3)[1:] == u'\'\'':
-                self.stream.read(3)
+                self.stream.forward(3)
             else:
-                self.stream.read(1)
-        self.stream.read(1)
+                self.stream.forward(1)
+        self.stream.forward(1)
         self.tokens.append(ScalarToken('', False, marker, marker))
 
@@ -804,5 +805,5 @@
         while True:
             while self.stream.peek() == u' ':
-                self.stream.read()
+                self.stream.forward()
                 space = True
             while self.stream.peek() not in u'\0\r\n?:,[]{}#'   \
@@ -811,15 +812,15 @@
                     or (not self.flow_level and self.stream.peek() == ':' and self.stream.peek(2)[1] not in u' \0\r\n'):
                 space = self.stream.peek() not in u' \t'
-                self.stream.read()
+                self.stream.forward()
                 self.allow_simple_key = False
             if self.stream.peek() not in u'\r\n':
                 break
             while self.stream.peek() in u'\r\n':
-                self.stream.read()
+                self.stream.forward()
                 if not self.flow_level:
                     self.allow_simple_key = True
             count = 0
             while self.stream.peek() == u' ' and count < indent:
-                self.stream.read()
+                self.stream.forward()
                 count += 1
             if count < indent:
@@ -834,2 +835,8 @@
         raise ScannerError(message)
 
+#try:
+#    import psyco
+#    psyco.bind(Scanner)
+#except ImportError:
+#    pass
+
Index: /branches/pyyaml3000/lib/yaml/stream.py
===================================================================
--- /branches/pyyaml3000/lib/yaml/stream.py	(revision 44)
+++ /branches/pyyaml3000/lib/yaml/stream.py	(revision 45)
@@ -1,31 +1,229 @@
-
-from marker import Marker
+# This module contains abstractions for the input stream. You don't have to
+# look further; there is no pretty code here.
+#
+# We define two classes here.
+#
+#   Marker(source, line, column, buffer, pointer)
+# It's just a record and its only use is producing nice error messages.
+# Parser does not use it for any other purposes.
+#
+#   Stream(source, data)
+# Stream determines the encoding of `data` and converts it to unicode.
+# Stream provides the following methods and attributes:
+#   stream.peek(length=1) - return the next `length` characters
+#   stream.forward(length=1) - advance the current position by `length` characters.
+#   stream.index - the number of the current character.
+#   stream.line, stream.column - the line and the column of the current character.
+
+
+from error import YAMLError
+
+import codecs, re
+
+# Unfortunately, codec functions in Python 2.3 do not support the `finish`
+# argument, so we have to write our own wrappers.
+
+try:
+    codecs.utf_8_decode('', 'strict', False)
+    from codecs import utf_8_decode, utf_16_le_decode, utf_16_be_decode
+
+except TypeError:
+
+    def utf_16_le_decode(data, errors, finish=False):  # fallback wrapper accepting `finish`
+        if not finish and len(data) % 2 == 1:
+            data = data[:-1]  # more input may follow: hold back the dangling odd byte
+        return codecs.utf_16_le_decode(data, errors)
+
+    def utf_16_be_decode(data, errors, finish=False):  # fallback wrapper accepting `finish`
+        if not finish and len(data) % 2 == 1:
+            data = data[:-1]  # more input may follow: hold back the dangling odd byte
+        return codecs.utf_16_be_decode(data, errors)
+
+    def utf_8_decode(data, errors, finish=False):
+        if not finish:
+            # More input may follow, so hold back a possibly incomplete
+            # multibyte character at the end of `data` instead of failing on it.
+            # The first byte of a multi-byte sequence is in the range 0xc0 to 0xfd.
+            # All further bytes are in the range 0x80 to 0xbf.
+            # UTF-8 encoded UCS characters may be up to six bytes long.
+            count = 0
+            while count < 5 and count < len(data)   \
+                    and '\x80' <= data[-count-1] <= '\xBF':
+                count += 1  # step backwards over trailing continuation bytes
+            if count < 5 and count < len(data)  \
+                    and '\xC0' <= data[-count-1] <= '\xFD':
+                data = data[:-count-1]  # cut from the lead byte to the end
+        return codecs.utf_8_decode(data, errors)
+
+class Marker:  # record of a position in the input; get_snippet() renders it
+
+    def __init__(self, source, line, column, buffer, pointer):
+        self.source = source
+        self.line = line
+        self.column = column
+        self.buffer = buffer  # unicode text, or None when no snippet can be produced
+        self.pointer = pointer  # offset of the marked character inside `buffer`
+
+    def get_snippet(self, max_length=79):  # returns the marked line plus a '^' pointer line
+        if self.buffer is None:
+            return None
+        head = ''
+        start = self.pointer
+        while start > 0 and self.buffer[start-1] not in u'\0\r\n\x85\u2028\u2029':
+            start -= 1  # walk back towards the beginning of the line
+            if self.pointer-start > max_length/2-1:
+                head = ' ... '  # left part is too long: mark it as truncated
+                start += 5  # give the five characters of ' ... ' back
+                break
+        tail = ''
+        end = self.pointer
+        while end < len(self.buffer) and self.buffer[end] not in u'\0\r\n\x85\u2028\u2029':
+            end += 1  # walk forward towards the end of the line
+            if end-self.pointer > max_length/2-1:
+                tail = ' ... '  # right part is too long: mark it as truncated
+                end -= 5  # give the five characters of ' ... ' back
+                break
+        snippet = self.buffer[start:end].encode('utf-8')  # result is a byte string
+        return head + snippet + tail + '\n'  \
+                + ' '*(self.pointer-start+len(head)) + '^' + '\n'
+
+class StreamError(YAMLError):  # raised by Stream for undecodable bytes or unacceptable characters
+
+    def __init__(self, source, encoding, character, position, reason):
+        self.source = source
+        self.encoding = encoding
+        self.character = character  # offending byte (str) or character (unicode)
+        self.position = position
+        self.reason = reason
+
+    def __str__(self):
+        if isinstance(self.character, str):  # a raw byte: report a decoding failure
+            return "'%s' codec can't decode byte #x%02x: %s\n"  \
+                    "\tin file '%s', position %d."   \
+                    % (self.encoding, ord(self.character), self.reason,
+                            self.source, self.position)
+        else:  # a unicode character: report it as unacceptable
+            return "unacceptable character #x%04x: %s\n"    \
+                    "\tin file '%s', position %d."   \
+                    % (ord(self.character), self.reason,
+                            self.source, self.position)
 
 class Stream:
+    # Stream:
+    # - determines the data encoding and converts it to unicode,
+    # - checks if characters are in allowed range,
+    # - adds '\0' to the end.
+
+    # Yeah, it's ugly and slow.
 
     def __init__(self, source, data):
         self.source = source
-        self.data = unicode(data, 'utf-8')+u'\0'
+        self.stream = None  # file-like input; stays None for string input
+        self.stream_pointer = 0  # number of raw bytes read from self.stream so far
+        self.eof = True
+        self.buffer = u''  # decoded characters not yet discarded
+        self.pointer = 0  # offset of the current character inside self.buffer
+        self.raw_buffer = None  # bytes not yet decoded; None when there are none
+        self.raw_decode = None  # decoder function chosen by determine_encoding()
         self.index = 0
         self.line = 0
         self.column = 0
-
-    def peek(self, k=1):
-        return self.data[self.index:self.index+k]
-
-    def read(self, k=1):
-        value = self.data[self.index:self.index+k]
-        for i in range(k):
-            if self.index >= len(self.data):
-                break
-            if self.data[self.index] in u'\r\n\x85\u2028\u2029':
+        if isinstance(data, unicode):  # already decoded: only validate it
+            self.check_printable(data)
+            self.buffer = data+u'\0'
+        elif isinstance(data, str):  # byte string: decode it from memory
+            self.raw_buffer = data
+            self.determine_encoding()
+        else:  # anything else is treated as an object with a read() method
+            self.stream = data
+            self.eof = False
+            self.raw_buffer = ''
+            self.determine_encoding()
+
+    def peek(self, length=1):  # return the next `length` characters without advancing
+        if self.pointer+length >= len(self.buffer):
+            self.update(length)  # try to decode more input before slicing
+        return self.buffer[self.pointer:self.pointer+length]
+
+    def forward(self, length=1):  # advance `length` characters, updating index/line/column
+        if self.pointer+length+1 >= len(self.buffer):
+            self.update(length+1)  # one extra character for the look-ahead after '\r'
+        for k in range(length):
+            ch = self.buffer[self.pointer]
+            self.pointer += 1
+            self.index += 1
+            if ch in u'\n\x85\u2028\u2029'  \
+                    or (ch == u'\r' and self.buffer[self.pointer] != u'\n'):  # pointer is already past ch
                 self.line += 1
                 self.column = 0
-            else:
+            elif ch != u'\uFEFF':  # U+FEFF (byte order mark) does not advance the column
                 self.column += 1
-            self.index += 1
-        return value
 
     def get_marker(self):
-        return Marker(self.source, self.data, self.index, self.line, self.column)
-
+        if self.stream is None:  # input was given as a string, not as a file-like object
+            return Marker(self.source, self.line, self.column,
+                    self.buffer, self.pointer)
+        else:
+            return Marker(self.source, self.line, self.column, None, None)  # get_snippet() will return None
+
+    def determine_encoding(self):  # choose self.raw_decode from the leading bytes of the input
+        while not self.eof and len(self.raw_buffer) < 2:
+            self.update_raw()  # the BOM test below needs at least two bytes
+        if self.raw_buffer.startswith(codecs.BOM_UTF16_LE):
+            self.raw_decode = utf_16_le_decode
+        elif self.raw_buffer.startswith(codecs.BOM_UTF16_BE):
+            self.raw_decode = utf_16_be_decode
+        else:  # no UTF-16 byte order mark: treat the input as UTF-8
+            self.raw_decode = utf_8_decode
+        self.update(1)  # decode a first portion of the input
+
+    NON_PRINTABLE = re.compile(u'[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD]')
+    def check_printable(self, data):  # raise StreamError if `data` has a character outside the allowed set
+        match = self.NON_PRINTABLE.search(data)
+        if match:
+            character = match.group()
+            position = self.index+(len(self.buffer)-self.pointer)+match.start()  # consumed + still buffered + offset in `data`
+            raise StreamError(self.source, 'unicode', character, position,
+                    "control characters are not allowed")
+
+    def update(self, length):  # decode more input until the buffer holds `length` characters or EOF
+        if self.raw_buffer is None:  # nothing left to decode (unicode input, or EOF already handled)
+            return
+        self.buffer = self.buffer[self.pointer:]  # drop the characters already consumed
+        self.pointer = 0
+        while len(self.buffer) < length:
+            if not self.eof:
+                self.update_raw()
+            try:
+                data, converted = self.raw_decode(self.raw_buffer,
+                        'strict', self.eof)
+            except UnicodeDecodeError, exc:
+                character = exc.object[exc.start]  # the byte the decoder rejected
+                if self.stream is not None:
+                    position = self.stream_pointer-len(self.raw_buffer)+exc.start  # offset counted over the bytes read so far
+                else:
+                    position = exc.start
+                raise StreamError(self.source, exc.encoding,
+                        character, position, exc.reason)
+            self.check_printable(data)
+            self.buffer += data
+            self.raw_buffer = self.raw_buffer[converted:]  # keep only the bytes not yet decoded
+            if self.eof:
+                self.buffer += u'\0'  # terminator that marks the end of the input
+                self.raw_buffer = None
+                break
+
+    def update_raw(self, size=1024):  # read up to `size` more bytes from self.stream
+        data = self.stream.read(size)
+        if data:
+            self.raw_buffer += data
+            self.stream_pointer += len(data)
+        else:  # an empty read is taken as end of input
+            self.eof = True
+
+#try:
+#    import psyco
+#    psyco.bind(Stream)
+#except ImportError:
+#    pass
+
Index: /branches/pyyaml3000/tests/test_marker.py
===================================================================
--- /branches/pyyaml3000/tests/test_marker.py	(revision 39)
+++ /branches/pyyaml3000/tests/test_marker.py	(revision 45)
@@ -2,5 +2,5 @@
 import test_appliance
 
-from yaml.marker import Marker
+from yaml.stream import Marker
 
 class TestMarker(test_appliance.TestAppliance):
@@ -19,16 +19,15 @@
                     column += 1
                 index += 1
-            for str_type in [str, unicode]:
-                marker = Marker(test_name, str_type(input), index, line, column)
-                snippet = marker.get_snippet()
-                #print "INPUT:"
-                #print input
-                #print "SNIPPET:"
-                #print snippet
-                self.failUnless(isinstance(snippet, str))
-                self.failUnlessEqual(snippet.count('\n'), 2)
-                data, pointer, dummy = snippet.split('\n')
-                self.failUnless(len(data) < 80)
-                self.failUnlessEqual(data[len(pointer)-1], '*')
+            marker = Marker(test_name, line, column, unicode(input), index)
+            snippet = marker.get_snippet()
+            #print "INPUT:"
+            #print input
+            #print "SNIPPET:"
+            #print snippet
+            self.failUnless(isinstance(snippet, str))
+            self.failUnlessEqual(snippet.count('\n'), 2)
+            data, pointer, dummy = snippet.split('\n')
+            self.failUnless(len(data) < 80)
+            self.failUnlessEqual(data[len(pointer)-1], '*')
 
 TestMarker.add_tests('testMarkers', '.markers')
Index: /branches/pyyaml3000/tests/test_yaml.py
===================================================================
--- /branches/pyyaml3000/tests/test_yaml.py	(revision 44)
+++ /branches/pyyaml3000/tests/test_yaml.py	(revision 45)
@@ -3,4 +3,5 @@
 
 from test_marker import *
+from test_stream import *
 from test_canonical import *
 from test_tokens import *
Index: /branches/pyyaml3000/tests/data/invalid-character.stream-error
===================================================================
--- /branches/pyyaml3000/tests/data/invalid-character.stream-error	(revision 45)
+++ /branches/pyyaml3000/tests/data/invalid-character.stream-error	(revision 45)
@@ -0,0 +1,18 @@
+-------------------------------------------------------------------------------------------------------------------------------
+-------------------------------------------------------------------------------------------------------------------------------
+-------------------------------------------------------------------------------------------------------------------------------
+-------------------------------------------------------------------------------------------------------------------------------
+-------------------------------------------------------------------------------------------------------------------------------
+-------------------------------------------------------------------------------------------------------------------------------
+-------------------------------------------------------------------------------------------------------------------------------
+-------------------------------------------------------------------------------------------------------------------------------
+-------------------------------------------------------------------------------------------------------------------------------
+-------------------------------------------------------------------------------------------------------------------------------
+-------------------------------------------------------------------------------------------------------------------------------
+-------------------------------------------------------------------------------------------------------------------------------
+-------------------------------------------------------------------------------------------------------------------------------
+-------------------------------------------------------------------------------------------------------------------------------
+-------------------------------------------------------------------------------------------------------------------------------
+-------------------------------------------------------------------------------------------------------------------------------
+Control character ('\x0'):   <--
+-------------------------------------------------------------------------------------------------------------------------------
Index: /branches/pyyaml3000/tests/data/invalid-utf8-byte.stream-error
===================================================================
--- /branches/pyyaml3000/tests/data/invalid-utf8-byte.stream-error	(revision 45)
+++ /branches/pyyaml3000/tests/data/invalid-utf8-byte.stream-error	(revision 45)
@@ -0,0 +1,18 @@
+-------------------------------------------------------------------------------------------------------------------------------
+-------------------------------------------------------------------------------------------------------------------------------
+-------------------------------------------------------------------------------------------------------------------------------
+-------------------------------------------------------------------------------------------------------------------------------
+-------------------------------------------------------------------------------------------------------------------------------
+-------------------------------------------------------------------------------------------------------------------------------
+-------------------------------------------------------------------------------------------------------------------------------
+-------------------------------------------------------------------------------------------------------------------------------
+-------------------------------------------------------------------------------------------------------------------------------
+-------------------------------------------------------------------------------------------------------------------------------
+-------------------------------------------------------------------------------------------------------------------------------
+-------------------------------------------------------------------------------------------------------------------------------
+-------------------------------------------------------------------------------------------------------------------------------
+-------------------------------------------------------------------------------------------------------------------------------
+-------------------------------------------------------------------------------------------------------------------------------
+-------------------------------------------------------------------------------------------------------------------------------
+Invalid byte ('\xFF'): ÿ <--
+-------------------------------------------------------------------------------------------------------------------------------
Index: /branches/pyyaml3000/tests/test_stream.py
===================================================================
--- /branches/pyyaml3000/tests/test_stream.py	(revision 45)
+++ /branches/pyyaml3000/tests/test_stream.py	(revision 45)
@@ -0,0 +1,31 @@
+
+import test_appliance
+from yaml.stream import Stream, StreamError
+
+class TestStreamErrors(test_appliance.TestAppliance):  # each input form must raise StreamError
+
+    def _testStreamUnicodeErrors(self, test_name, stream_filename):
+        try:
+            data = unicode(file(stream_filename, 'rb').read(), 'utf-8')
+        except UnicodeDecodeError:  # not valid UTF-8: there is no unicode input to test
+            return
+        self.failUnlessRaises(StreamError, lambda: self._load(stream_filename, data))
+
+    def _testStreamStringErrors(self, test_name, stream_filename):
+        data = file(stream_filename, 'rb').read()  # raw byte string input
+        self.failUnlessRaises(StreamError, lambda: self._load(stream_filename, data))
+
+    def _testStreamFileErrors(self, test_name, stream_filename):
+        data = file(stream_filename, 'rb')  # file object input, read by Stream itself
+        self.failUnlessRaises(StreamError, lambda: self._load(stream_filename, data))
+
+    def _load(self, stream_filename, data):
+        stream = Stream(stream_filename, data)
+        while stream.peek() != u'\0':  # walk the whole input up to the terminator
+            stream.forward()
+
+TestStreamErrors.add_tests('testStreamUnicodeErrors', '.stream-error')
+TestStreamErrors.add_tests('testStreamStringErrors', '.stream-error')
+TestStreamErrors.add_tests('testStreamFileErrors', '.stream-error')
+
+
Index: /branches/pyyaml3000/tests/test_appliance.py
===================================================================
--- /branches/pyyaml3000/tests/test_appliance.py	(revision 44)
+++ /branches/pyyaml3000/tests/test_appliance.py	(revision 45)
@@ -6,13 +6,13 @@
     DATA = 'tests/data'
 
-    tests = {}
+    all_tests = {}
     for filename in os.listdir(DATA):
         if os.path.isfile(os.path.join(DATA, filename)):
             root, ext = os.path.splitext(filename)
-            tests.setdefault(root, []).append(ext)
+            all_tests.setdefault(root, []).append(ext)
 
     def add_tests(cls, method_name, *extensions):
-        for test in cls.tests:
-            available_extensions = cls.tests[test]
+        for test in cls.all_tests:
+            available_extensions = cls.all_tests[test]
             for ext in extensions:
                 if ext not in available_extensions:
@@ -23,5 +23,11 @@
                     getattr(self, '_'+method_name)(test, *filenames)
                 test = test.replace('-', '_')
-                test_method.__name__ = '%s_%s' % (method_name, test)
+                try:
+                    test_method.__name__ = '%s_%s' % (method_name, test)
+                except TypeError:
+                    import new
+                    test_method = new.function(test_method.func_code, test_method.func_globals,
+                            '%s_%s' % (method_name, test), test_method.func_defaults,
+                            test_method.func_closure)
                 setattr(cls, test_method.__name__, test_method)
     add_tests = classmethod(add_tests)
