Index: branches/pyyaml3000/lib/yaml/scanner.py
===================================================================
--- trunk/sandbox/my-parser/parser2.py	(revision 37)
+++ branches/pyyaml3000/lib/yaml/scanner.py	(revision 39)
@@ -1,2 +1,278 @@
+
+from marker import Marker
+from error import ParserError
+from stream import Stream
+
+class Scanner:
+
+    def __init__(self, source, data):
+        """Initialize the scanner state for the given input."""
+        # The input stream. The Stream class does the dirty work of checking
+        # for BOM and converting the input data to Unicode. It also adds LF to
+        # the end if the data does not end with an EOL character.
+        #
+        # Stream supports the following methods:
+        #   self.stream.peek(k=1)   # peek the next k characters
+        #   self.stream.read(k=1)   # read the next k characters and move the
+        #                           # pointer
+        self.stream = Stream(source, data)
+
+        # Have we reached the end of the stream?
+        self.done = False
+
+        # The number of unclosed '{' and '['. `flow_level == 0` means block
+        # context.
+        self.flow_level = 0
+
+        # List of processed tokens that are not yet emitted.
+        self.tokens = []
+
+        # Number of tokens that were emitted through the `get_token` method.
+        self.tokens_taken = 0
+
+        # The current indentation level.
+        self.indent = -1
+
+        # Past indentation levels.
+        self.indents = []
+
+        # Variables related to simple key treatment.
+
+        # A simple key is a key that is not denoted by the '?' indicator.
+        # Examples of simple keys:
+        #   ---
+        #   block simple key: value
+        #   ? not a simple key:
+        #   : { flow simple key: value }
+        # We emit the KEY token before all keys, so when we find a potential
+        # simple key, we try to locate the corresponding ':' indicator.
+        # Simple keys should be limited to a single line and 1024 characters.
+
+        # Can a block collection start at the current position? A block
+        # collection may start:
+        #   - at the beginning of the line (not counting spaces),
+        #   - after the block sequence indicator '-'.
+        self.allow_block_collection = True
+
+        # Can a simple key in flow context start at the current position? A
+        # simple key may start after the '{', '[', and ',' indicators.
+        self.allow_flow_simple_keys = False
+
+        # Keep track of possible simple keys. This is a dictionary. The key
+        # is `flow_level`; there can be no more than one possible simple key
+        # for each level. The value is a record of
+        #   (stream.index, stream.line, stream.column, token_number)
+        self.possible_simple_keys = {}
+
+    # Public methods:
+
+    def peek_token(self):
+        """Return the current token without removing it, or None if none."""
+        # Keep scanning for as long as `need_more_tokens` asks for it.
+        while self.need_more_tokens():
+            self.fetch_more_tokens()
+        return self.tokens[0] if self.tokens else None
+
+    def get_token(self):
+        """Remove and return the current token, or None if there is none."""
+        while self.need_more_tokens():
+            self.fetch_more_tokens()
+        if self.tokens:
+            self.tokens_taken += 1
+            return self.tokens.pop(0)
+
+    # Private methods:
+
+    def need_more_tokens(self):
+        """Return True if more input must be scanned before emitting a token."""
+        if self.done:
+            return False
+        if not self.tokens:
+            return True
+        # The current token may be a potential simple key, so we
+        # need to look further.
+        return self.next_possible_simple_key() == self.tokens_taken
+
+    def fetch_more_tokens(self):
+        """Skip to the next token and dispatch to the matching fetcher."""
+        # Eat whitespaces and comments until we reach the next token.
+        self.find_next_token()
+
+        # Compare the current indentation and column. It may add some tokens and
+        # lower the indent. NOTE(review): siblings call `unwind_indents` instead.
+        self.unwind_indent(self.stream.column)
+
+        # Peek the next character.
+        ch = self.stream.peek()
+
+        # Is it the end of stream?
+        if ch is None:
+            return self.fetch_end()
+
+        # Is it a directive?
+        if ch == u'%' and self.check_directive():
+            return self.fetch_directive()
+
+        # Is it the document start?
+        if ch == u'-' and self.check_document_start():
+            return self.fetch_document_start()
+
+        # Is it the document end?
+        if ch == u'.' and self.check_document_end():
+            return self.fetch_document_end()
+
+        # Note: the order of the following checks is NOT significant.
+
+        # Is it the sequence indicator?
+        if ch in u'-,' and self.check_entry():
+            return self.fetch_entry()
+
+        # Is it the flow sequence start indicator?
+        if ch == u'[':
+            return self.fetch_flow_sequence_start()
+
+        # Is it the flow mapping start indicator?
+        if ch == u'{':
+            return self.fetch_flow_mapping_start()
+
+        # Is it the flow sequence end indicator?
+        if ch == u']':
+            return self.fetch_flow_sequence_end()
+
+        # Is it the flow mapping end indicator?
+        if ch == u'}':
+            return self.fetch_flow_mapping_end()
+
+        # Is it the key indicator?
+        if ch == u'?' and self.check_key():
+            return self.fetch_key()
+
+        # Is it the value indicator?
+        if ch == u':' and self.check_value():
+            return self.fetch_value()
+
+        # Is it an alias?
+        if ch == u'*':
+            return self.fetch_alias()
+
+        # Is it an anchor?
+        if ch == u'&':
+            return self.fetch_anchor()
+
+        # Is it a tag?
+        if ch == u'!':
+            return self.fetch_tag()
+
+        # Is it a literal scalar?
+        if ch == u'|':
+            return self.fetch_literal()
+
+        # Is it a folded scalar?
+        if ch == u'>':
+            return self.fetch_folded()
+
+        # Is it a single quoted scalar?
+        if ch == u'\'':
+            return self.fetch_single()
+
+        # Is it a double quoted scalar?
+        if ch == u'\"':
+            return self.fetch_double()
+
+        # It must be a plain scalar.
+        if self.check_plain():
+            return self.fetch_plain()
+
+        # No? It's an error then. Let's produce a nice error message.
+        self.invalid_token()
+
+    def fetch_end(self):
+        """Append the END token and mark the stream as finished."""
+        # Drop the current indentation back to -1.
+        self.unwind_indents(-1)
+
+        # Clear the scanner flags (not really needed).
+        self.possible_simple_keys = {}
+        self.allow_flow_simple_keys = False
+        self.allow_block_collection = False
+
+        # Append END, positioned at the current stream marker.
+        end_token = EndToken(self.stream.get_marker())
+        self.tokens.append(end_token)
+
+        # Nothing more to scan after this.
+        self.done = True
+
+    def check_directive(self):
+        """Return True if a directive may start here, otherwise None."""
+        # We are looking for
+        #   /* The beginning of the line */ '%'
+        # and the '%' indicator itself has already been checked.
+        at_line_start = (self.stream.column == 0)
+        return True if at_line_start else None
+
+    def check_document_start(self):
+        """Return True for a document start marker here, otherwise None."""
+        # Pattern: /* The beginning of the line */ '---' /* Space or EOL */
+        if self.stream.column != 0:
+            return None
+        head = self.stream.peek(4)
+        if head[:3] == u'---' and head[3] in u' \t\r\n\x85\u2028\u2029':
+            return True
+
+    def fetch_document_start(self):
+        """Consume '---' and append the DOCUMENT-START token."""
+        # Drop the current indentation back to -1.
+        self.unwind_indents(-1)
+
+        # A block collection cannot start right after '---'.
+        self.allow_block_collection = False
+
+        # Flow simple keys are off (not needed -- this is the block context).
+        self.allow_flow_simple_keys = False
+
+        # Forget possible simple keys (not needed -- EOL should have done it).
+        self.possible_simple_keys = {}
+
+        # Remember where the indicator begins.
+        begin = self.stream.get_marker()
+
+        # The characters are already checked, so just skip over them.
+        self.stream.read(3)
+
+        # DOCUMENT-START spans from `begin` to the marker taken after the read.
+        end = self.stream.get_marker()
+        self.tokens.append(DocumentStartToken(begin, end))
+
+
+    def check_document_end(self):
+        """Return True for a document end marker here, otherwise None."""
+        head = self.stream.peek(4) if self.stream.column == 0 else u''
+        if head[:3] == u'...' and head[3] in u' \t\r\n\x85\u2028\u2029':
+            return True
+
+    def fetch_document_end(self):
+        """Consume '...' and append the DOCUMENT-END token."""
+
+        # As in `fetch_document_start`, drop the indentation back to -1.
+        self.unwind_indents(-1)
+
+        # Clear the scanner flags (not really needed).
+        self.possible_simple_keys = {}
+        self.allow_flow_simple_keys = False
+        self.allow_block_collection = False
+
+        # Note the position of the indicator before consuming it.
+        begin = self.stream.get_marker()
+
+        # The characters are already checked, so just skip over them.
+        self.stream.read(3)
+
+        # DOCUMENT-END spans from `begin` to the marker taken after the read.
+        end = self.stream.get_marker()
+        self.tokens.append(DocumentEndToken(begin, end))
+
+
+
 # Tokens:
 # YAML_DIRECTIVE: ^ '%' YAML ' '+ (version: \d+ '.' \d+) s-l-comments
@@ -128,5 +404,5 @@
         return error_position+error_pointer+error_message
 
-class Scanner:
+class _Scanner:
 
     def scan(self, source, data):
