Changeset 48 for branches/pyyaml3000/lib/yaml/scanner.py
- Timestamp:
- 02/18/06 18:52:18 (7 years ago)
- File:
-
- 1 edited
-
branches/pyyaml3000/lib/yaml/scanner.py (modified) (15 diffs)
Legend:
- Unmodified
- Added
- Removed
-
branches/pyyaml3000/lib/yaml/scanner.py
r47 r48 25 25 # ^ 26 26 def __init__(self, context=None, context_marker=None, 27 problem=None, problem_marker=None , description=None):27 problem=None, problem_marker=None): 28 28 self.context = context 29 29 self.context_marker = context_marker 30 30 self.problem = problem 31 31 self.problem_marker = problem_marker 32 self.description = description33 32 34 33 def __str__(self): … … 40 39 if marker is not None: 41 40 lines.append(str(marker)) 42 if self.description is not None:43 lines.append(self.description)44 41 return '\n'.join(lines) 45 42 … … 63 60 # 64 61 # Reader supports the following methods 65 # self.reader.peek(k=1) # peek the next k characters 66 # self.reader.forward(k=1) # read the next k characters and move the 67 # # pointer 62 # self.reader.peek(i=0) # peek the next i-th character 63 # self.reader.prefix(l=1) # peek the next l characters 64 # self.reader.forward(l=1) # read the next l characters 65 # and move the pointer 68 66 self.reader = reader 69 67 … … 162 160 ch = self.reader.peek() 163 161 164 # Is it the end of reader?162 # Is it the end of stream? 165 163 if ch == u'\0': 166 return self.fetch_end() 164 return self.fetch_stream_end() 165 166 # Is it the byte order mark? 167 if ch == u'\uFEFF': 168 return self.fetch_bom() 167 169 168 170 # Is it a directive? … … 241 243 242 244 # No? It's an error. Let's produce a nice error message. 243 self.invalid_token() 245 raise ScannerError("while scanning for the next token", None, 246 "found character %r that cannot start any token" 247 % ch.encode('utf-8'), self.reader.get_marker()) 244 248 245 249 # Simple keys treatment. … … 343 347 # Fetchers. 344 348 345 def fetch_ end(self):349 def fetch_stream_end(self): 346 350 347 351 # Set the current intendation to -1. … … 360 364 # The reader is ended. 361 365 self.done = True 366 367 def fetch_bom(self): 368 # We consider the BOM marker as a DOCUMENT-END indicator unless it's 369 # the first character in the stream. It's a reasonable approximation 370 # of the specification requirements. We can follow the specification 371 # literally, but it will require a new token class. Probably later. 372 373 # We ignore BOM if it is the first character in the stream. 374 if self.reader.index == 0: 375 slef.reader.forward() 376 377 # Otherwise we issue DOCUMENT-END. 378 else: 379 380 # Set the current intendation to -1. 381 self.unwind_indent(-1) 382 383 # Reset simple keys. Note that there could not be a block 384 # collection after BOM. 385 self.remove_possible_simple_key() 386 self.allow_simple_key = False 387 388 # Add DOCUMENT-END. 389 start_marker = self.reader.get_marker() 390 self.reader.forward() 391 end_marker = self.reader.get_marker() 392 self.tokens.append(DocumentEndToken(start_marker, end_marker)) 362 393 363 394 def fetch_directive(self): … … 638 669 # DOCUMENT-START: ^ '---' (' '|'\n') 639 670 if self.reader.column == 0: 640 prefix = self.reader.peek(4)641 if prefix[:3] == u'---' and prefix[3]in u'\0 \t\r\n\x85\u2028\u2029':671 if self.reader.prefix(3) == u'---' \ 672 and self.reader.peek(3) in u'\0 \t\r\n\x85\u2028\u2029': 642 673 return True 643 674 … … 647 678 if self.reader.column == 0: 648 679 prefix = self.reader.peek(4) 649 if prefix[:3] == u'...' and prefix[3] in u'\0 \t\r\n\x85\u2028\u2029': 680 if self.reader.prefix(3) == u'...' \ 681 and self.reader.peek(3) in u'\0 \t\r\n\x85\u2028\u2029': 650 682 return True 651 683 … … 658 690 # ENTRY(block context): '-' (' '|'\n') 659 691 else: 660 prefix = self.reader.peek(2)661 return prefix[0] == u'-' and prefix[1]in u'\0 \t\r\n\x85\u2028\u2029'692 return self.reader.peek() == u'-' \ 693 and self.reader.peek(1) in u'\0 \t\r\n\x85\u2028\u2029' 662 694 663 695 def check_key(self): … … 669 701 # KEY(block context): '?' (' '|'\n') 670 702 else: 671 prefix = self.reader.peek(2) 672 return prefix[1] in u'\0 \t\r\n\x85\u2028\u2029' 703 return self.reader.peek(1) in u'\0 \t\r\n\x85\u2028\u2029' 673 704 674 705 def check_value(self): … … 680 711 # VALUE(block context): ':' (' '|'\n') 681 712 else: 682 prefix = self.reader.peek(2) 683 return prefix[1] in u'\0 \t\r\n\x85\u2028\u2029' 713 return self.reader.peek(1) in u'\0 \t\r\n\x85\u2028\u2029' 684 714 685 715 def check_plain(self): 686 return True 716 717 # A plain scalar may start with any non-space character except: 718 # '-', '?', ':', ',', '[', ']', '{', '}', 719 # '#', '&', '*', '!', '|', '>', '\'', '\"', 720 # '%', '@', '`'. 721 # 722 # It may also start with 723 # '-', '?', ':' 724 # if it is followed by a non-space character. 725 # 726 # Note that we limit the last rule to the block context (except the 727 # '-' character) because we want the flow context to be space 728 # independent. 729 ch = self.reader.peek() 730 return ch not in u'\0 \t\r\n\x85\u2028\u2029-?:,[]{}#&*!|>\'\"%@`' \ 731 or (self.reader.peek(1) not in u'\0 \t\r\n\x85\u2028\u2029' 732 and (ch == '-' or (not self.flow_level and ch in u'?:'))) 687 733 688 734 # Scanners. … … 706 752 707 753 def scan_directive(self): 708 marker = self.reader.get_marker() 709 if self.reader.peek(5) == u'%YAML ': 710 token = YAMLDirectiveToken(1, 1, marker, marker) 711 elif self.reader.peek(4) == u'%TAG ': 712 token = TagDirectiveToken(marker, marker) 754 # See the specification for details. 755 start_marker = self.reader.get_marker() 756 self.reader.forward() 757 name = self.scan_directive_name(start_marker) 758 value = None 759 if name == u'YAML': 760 value = self.scan_yaml_directive_value(start_marker) 761 end_marker = self.reader.get_marker() 762 elif name == u'TAG': 763 value = self.scan_tag_directive_value(start_marker) 764 end_marker = self.reader.get_marker() 713 765 else: 714 token = ReservedDirectiveToken('', marker, marker) 715 while self.reader.peek() not in u'\0\r\n': 766 end_marker = self.reader.get_marker() 767 while self.reader.peek() not in u'\0\r\n\x85\u2028\u2029': 768 self.reader.forward() 769 self.scan_directive_ignored_line(start_marker) 770 return DirectiveToken(name, value, start_marker, end_marker) 771 772 def scan_directive_name(self, start_marker): 773 # See the specification for details. 774 length = 0 775 ch = self.reader.peek(length) 776 while u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z' \ 777 or ch in u'-_': 778 length += 1 779 ch = self.reader.peek(length) 780 if not length: 781 raise ScannerError("while scanning a directive", start_marker, 782 "expected directive name, but found %r" % ch.encode('utf-8'), 783 self.reader.get_marker()) 784 value = self.reader.prefix(length) 785 self.reader.forward(length) 786 ch = self.reader.peek() 787 if ch not in u'\0 \r\n\x85\u2028\u2029': 788 raise ScannerError("while scanning a directive" % name, start_marker, 789 "expected alphabetic or numeric character, but found %r" 790 % ch.encode('utf-8'), self.reader.get_marker()) 791 return value 792 793 def scan_yaml_directive_value(self, start_marker): 794 # See the specification for details. 795 while self.reader.peek() == u' ': 716 796 self.reader.forward() 797 major = self.scan_yaml_directive_number(start_marker) 798 if self.reader.peek() != '.': 799 raise ScannerError("while scanning a directive", start_marker, 800 "expected a digit or '.', but found %r" % ch.encode('utf-8'), 801 self.reader.get_marker()) 717 802 self.reader.forward() 718 return token 803 minor = self.scan_yaml_directive_number(start_marker) 804 if self.reader.peek() not in u'\0 \r\n\x85\u2028\u2029': 805 raise ScannerError("while scanning a directive", start_marker, 806 "expected a digit or ' ', but found %r" % ch.encode('utf-8'), 807 self.reader.get_marker()) 808 return (major, minor) 809 810 def scan_yaml_directive_number(self, start_marker): 811 # See the specification for details. 812 ch = self.reader.peek() 813 if not (u'0' <= ch <= '9'): 814 raise ScannerError("while scanning a directive", start_marker, 815 "expected a digit, but found %r" % ch.encode('utf-8'), 816 self.reader.get_marker()) 817 length = 0 818 while u'0' <= self.reader.peek(length) <= u'9': 819 length += 1 820 value = int(self.reader.prefix(length)) 821 self.reader.forward(length) 822 return value 823 824 def scan_tag_directive_value(self, start_marker): 825 # See the specification for details. 826 while self.reader.peek() == u' ': 827 self.reader.forward() 828 handle = self.scan_tag_directive_handle(start_marker) 829 while self.reader.peek() == u' ': 830 self.reader.forward() 831 prefix = self.scan_tag_directive_prefix(start_marker) 832 return (handle, prefix) 833 834 def scan_tag_directive_handle(self, start_marker): 835 # See the specification for details. 836 value = self.scan_tag_handle('directive', start_marker) 837 if self.reader.peek() != u' ': 838 raise ScannerError("while scanning a directive", start_marker, 839 "expected ' ', but found %r" % ch.encode('utf-8'), 840 self.reader.get_marker()) 841 return value 842 843 def scan_tag_directive_prefix(self, start_marker): 844 # See the specification for details. 845 value = self.scan_tag_uri('directive', start_marker) 846 ch = self.reader.peek() 847 if ch not in u'\0 \r\n\x85\u2028\u2029': 848 raise ScannerError("while scanning a directive", start_marker, 849 "expected ' ', but found %r" % ch.encode('utf-8'), 850 self.reader.get_marker()) 851 return value 852 853 def scan_directive_ignored_line(self, start_marker): 854 # See the specification for details. 855 while self.reader.peek() == u' ': 856 self.reader.forward() 857 if self.reader.peek() == u'#': 858 while self.reader.peek() not in u'\0\r\n\x85\u2028\u2029': 859 self.reader.forward() 860 ch = self.reader.peek() 861 if ch not in u'\0\r\n\x85\u2028\u2029': 862 raise ScannerError("while scanning a directive", start_marker, 863 "expected a comment or a line break, but found %r" 864 % ch.encode('utf-8'), self.reader.get_marker()) 865 self.scan_line_break() 719 866 720 867 def scan_anchor(self, TokenClass): 868 # The specification does not restrict characters for anchors and 869 # aliases. This may lead to problems, for instance, the document: 870 # [ *alias, value ] 871 # can be interpteted in two ways, as 872 # [ "value" ] 873 # and 874 # [ *alias , "value" ] 875 # Therefore we restrict aliases to numbers and ASCII letters. 721 876 start_marker = self.reader.get_marker() 722 while self.reader.peek() not in u'\0 \t\r\n,:': 877 indicator = self.reader.peek() 878 if indicator == '*': 879 name = 'alias' 880 else: 881 name = 'anchor' 882 self.reader.forward() 883 length = 0 884 ch = self.reader.peek(length) 885 while u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z' \ 886 or ch in u'-_': 887 length += 1 888 ch = self.reader.peek(length) 889 if not length: 890 raise ScannerError("while scanning an %s" % name, start_marker, 891 "expected anchor name, but found %r" % ch.encode('utf-8'), 892 self.reader.get_marker()) 893 value = self.reader.prefix(length) 894 self.reader.forward(length) 895 ch = self.reader.peek() 896 if ch not in u'\0 \t\r\n\x85\u2028\u2029?:,]}%@`': 897 raise ScannerError("while scanning an %s" % name, start_marker, 898 "expected alphabetic or numeric character, but found %r" 899 % ch.encode('utf-8'), self.reader.get_marker()) 900 end_marker = self.reader.get_marker() 901 return TokenClass(value, start_marker, end_marker) 902 903 def scan_tag(self): 904 # See the specification for details. 905 start_marker = self.reader.get_marker() 906 ch = self.reader.peek(1) 907 if ch == u'<': 908 handle = None 909 self.reader.forward(2) 910 suffix = self.scan_tag_uri('tag', start_marker) 911 if self.reader.peek() != u'>': 912 raise ScannerError("while parsing a tag", start_marking, 913 "expected '>', but got %r" % self.reader.peek().encode('utf-8'), 914 self.reader.get_marker()) 723 915 self.reader.forward() 916 elif ch in u'\0 \t\r\n\x85\u2028\u2029': 917 handle = None 918 suffix = u'!' 919 self.reader.forward() 920 else: 921 length = 1 922 use_handle = False 923 while ch not in u'\0 \r\n\x85\u2028\u2029': 924 if ch == u'!': 925 use_handle = True 926 break 927 length += 1 928 ch = self.reader.peek(length) 929 handle = u'!' 930 if use_handle: 931 handle = self.scan_tag_handle('tag', start_marker) 932 else: 933 handle = u'!' 934 self.reader.forward() 935 suffix = self.scan_tag_uri('tag', start_marker) 936 ch = self.reader.peek() 937 if ch not in u'\0 \r\n\x85\u2028\u2029': 938 raise ScannerError("while scanning a tag", start_marker, 939 "expected ' ', but found %r" % ch.encode('utf-8'), 940 self.reader.get_marker()) 941 value = (handle, suffix) 724 942 end_marker = self.reader.get_marker() 725 return TokenClass('', start_marker, end_marker) 726 727 def scan_tag(self): 943 return TagToken(value, start_marker, end_marker) 944 945 def scan_block_scalar(self, folded): 946 # See the specification for details. 947 948 chunks = [] 728 949 start_marker = self.reader.get_marker() 729 while self.reader.peek() not in u'\0 \t\r\n': 950 951 # Scan the header. 952 self.reader.forward() 953 chomping, increment = self.scan_block_scalar_indicators(start_marker) 954 self.scan_block_scalar_ignored_line(start_marker) 955 956 # Determine the indentation level and go to the first non-empty line. 957 min_indent = self.indent+1 958 if min_indent < 1: 959 min_indent = 1 960 if increment is None: 961 breaks, max_indent, end_marker = self.scan_block_scalar_indentation() 962 indent = max(min_indent, max_indent) 963 else: 964 indent = min_indent+increment-1 965 breaks, end_marker = self.scan_block_scalar_breaks(indent) 966 line_break = u'' 967 968 # Scan the inner part of the block scalar. 969 while self.reader.column == indent and self.reader.peek() != u'\0': 970 chunks.extend(breaks) 971 leading_non_space = self.reader.peek() not in u' \t' 972 length = 0 973 while self.reader.peek(length) not in u'\0\r\n\x85\u2028\u2029': 974 length += 1 975 chunks.append(self.reader.prefix(length)) 976 self.reader.forward(length) 977 line_break = self.scan_line_break() 978 breaks, end_marker = self.scan_block_scalar_breaks(indent) 979 if self.reader.column == indent and self.reader.peek() != u'\0': 980 # Unfortunately, folding rules are ambiguous. 981 # 982 # This is the folding according to the specification: 983 # 984 #if folded and line_break == u'\n' \ 985 # and leading_non_space and self.reader.peek() not in u' \t': 986 # if not breaks: 987 # chunks.append(u' ') 988 #else: 989 # chunks.append(line_break) 990 # 991 # This is Clark Evans's interpretation (also in the spec 992 # examples): 993 # 994 if folded and line_break == u'\n': 995 if not breaks: 996 if self.reader.peek() not in ' \t': 997 chunks.append(u' ') 998 else: 999 chunks.append(line_break) 1000 else: 1001 chunks.append(line_break) 1002 else: 1003 break 1004 1005 # Chomp the tail. 1006 if chomping is not False: 1007 chunks.append(line_break) 1008 if chomping is True: 1009 chunks.extend(breaks) 1010 1011 # We are done. 1012 return ScalarToken(u''.join(chunks), False, start_marker, end_marker) 1013 1014 def scan_block_scalar_indicators(self, start_marker): 1015 # See the specification for details. 1016 chomping = None 1017 increment = None 1018 ch = self.reader.peek() 1019 if ch in u'+-': 1020 if ch == '+': 1021 chomping = True 1022 else: 1023 chomping = False 730 1024 self.reader.forward() 1025 ch = self.reader.peek() 1026 if ch in u'0123456789': 1027 increment = int(ch) 1028 if increment == 0: 1029 raise ScannerError("while scanning a block scalar", start_marker, 1030 "expected indentation indicator in the range 1-9, but found 0", 1031 self.reader.get_marker()) 1032 self.reader.forward() 1033 elif ch in u'0123456789': 1034 increment = int(ch) 1035 if increment == 0: 1036 raise ScannerError("while scanning a block scalar", start_marker, 1037 "expected indentation indicator in the range 1-9, but found 0", 1038 self.reader.get_marker()) 1039 self.reader.forward() 1040 ch = self.reader.peek() 1041 if ch in u'+-': 1042 if ch == '+': 1043 chomping = True 1044 else: 1045 chomping = False 1046 self.reader.forward() 1047 ch = self.reader.peek() 1048 if ch not in u'\0 \r\n\x85\u2028\u2029': 1049 raise ScannerError("while scanning a block scalar", start_marker, 1050 "expected chomping or indentation indicators, but found %r" 1051 % ch.encode('utf-8'), self.reader.get_marker()) 1052 return chomping, increment 1053 1054 def scan_block_scalar_ignored_line(self, start_marker): 1055 # See the specification for details. 1056 while self.reader.peek() == u' ': 1057 self.reader.forward() 1058 if self.reader.peek() == u'#': 1059 while self.reader.peek() not in u'\0\r\n\x85\u2028\u2029': 1060 self.reader.forward() 1061 ch = self.reader.peek() 1062 if ch not in u'\0\r\n\x85\u2028\u2029': 1063 raise ScannerError("while scanning a block scalar", start_marker, 1064 "expected a comment or a line break, but found %r" 1065 % ch.encode('utf-8'), self.reader.get_marker()) 1066 self.scan_line_break() 1067 1068 def scan_block_scalar_indentation(self): 1069 # See the specification for details. 1070 chunks = [] 1071 max_indent = 0 731 1072 end_marker = self.reader.get_marker() 732 return TagToken('', start_marker, end_marker) 733 734 def scan_block_scalar(self, folded): 1073 while self.reader.peek() in u' \r\n\x85\u2028\u2029': 1074 if self.reader.peek() != u' ': 1075 chunks.append(self.scan_line_break()) 1076 end_marker = self.reader.get_marker() 1077 else: 1078 self.reader.forward() 1079 if self.reader.column > max_indent: 1080 max_indent = self.reader.column 1081 return chunks, max_indent, end_marker 1082 1083 def scan_block_scalar_breaks(self, indent): 1084 # See the specification for details. 1085 chunks = [] 1086 end_marker = self.reader.get_marker() 1087 while self.reader.column < indent and self.reader.peek() == u' ': 1088 self.reader.forward() 1089 while self.reader.peek() in u'\r\n\x85\u2028\u2029': 1090 chunks.append(self.scan_line_break()) 1091 end_marker = self.reader.get_marker() 1092 while self.reader.column < indent and self.reader.peek() == u' ': 1093 self.reader.forward() 1094 return chunks, end_marker 1095 1096 def scan_flow_scalar(self, double): 1097 # See the specification for details. 1098 chunks = [] 735 1099 start_marker = self.reader.get_marker() 736 1100 indent = self.indent+1 737 if indent < 1:1101 if indent == 0: 738 1102 indent = 1 739 while True:740 while self.reader.peek() and self.reader.peek() and self.reader.peek() not in u'\0\r\n\x85\u2028\u2029':741 self.reader.forward()742 if self.reader.peek() != u'\0':743 self.reader.forward()744 count = 0745 while count < indent and self.reader.peek() == u' ':746 self.reader.forward()747 count += 1748 if count < indent and self.reader.peek() not in u'#\r\n\x85\u2028\u2029':749 break750 return ScalarToken('', False, start_marker, start_marker)751 752 def scan_flow_scalar(self, double):753 marker = self.reader.get_marker()754 1103 quote = self.reader.peek() 755 1104 self.reader.forward() 1105 chunks.extend(self.scan_flow_scalar_non_spaces(double, indent, start_marker)) 756 1106 while self.reader.peek() != quote: 757 if double and self.reader.peek() == u'\\': 1107 chunks.extend(self.scan_flow_scalar_spaces(double, indent, start_marker)) 1108 chunks.extend(self.scan_flow_scalar_non_spaces(double, indent, start_marker)) 1109 self.reader.forward() 1110 end_marker = self.reader.get_marker() 1111 return ScalarToken(u''.join(chunks), False, start_marker, end_marker) 1112 1113 ESCAPE_REPLACEMENTS = { 1114 u'0': u'\0', 1115 u'a': u'\x07', 1116 u'b': u'\x08', 1117 u't': u'\x09', 1118 u'\t': u'\x09', 1119 u'n': u'\x0A', 1120 u'v': u'\x0B', 1121 u'f': u'\x0C', 1122 u'r': u'\x0D', 1123 u'e': u'\x1B', 1124 u' ': u'\x20', 1125 u'\"': u'\"', 1126 u'\\': u'\\', 1127 u'N': u'\x85', 1128 u'_': u'\xA0', 1129 u'L': u'\u2028', 1130 u'P': u'\u2029', 1131 } 1132 1133 ESCAPE_CODES = { 1134 u'x': 2, 1135 u'u': 4, 1136 u'U': 8, 1137 } 1138 1139 def scan_flow_scalar_non_spaces(self, double, indent, start_marker): 1140 # See the specification for details. 1141 chunks = [] 1142 while True: 1143 length = 0 1144 while self.reader.peek(length) not in u'\'\"\\\0 \t\r\n\x85\u2028\u2029': 1145 length += 1 1146 if length: 1147 chunks.append(self.reader.prefix(length)) 1148 self.reader.forward(length) 1149 ch = self.reader.peek() 1150 if not double and ch == u'\'' and self.reader.peek(1) == u'\'': 1151 chunks.append(u'\'') 758 1152 self.reader.forward(2) 759 elif not double and self.reader.peek(3)[1:] == u'\'\'': 760 self.reader.forward(3) 1153 elif (double and ch == u'\'') or (not double and ch in u'\"\\'): 1154 chunks.append(ch) 1155 self.reader.forward() 1156 elif double and ch == u'\\': 1157 self.reader.forward() 1158 ch = self.reader.peek() 1159 if ch in self.ESCAPE_REPLACEMENTS: 1160 chunks.append(self.ESCAPE_REPLACEMENTS[ch]) 1161 self.reader.forward() 1162 elif ch in self.ESCAPE_CODES: 1163 length = self.ESCAPE_CODES[ch] 1164 self.reader.forward() 1165 for k in range(length): 1166 if self.reader.peek(k) not in u'0123456789ABCDEFabcdef': 1167 raise ScannerError("while scanning a double-quoted scalar", start_marker, 1168 "expected escape sequence of %d hexdecimal numbers, but found %r" % 1169 (length, self.reader.peek(k).encode('utf-8')), self.reader.get_marker()) 1170 code = int(self.reader.prefix(length), 16) 1171 chunks.append(unichr(code)) 1172 self.reader.forward(length) 1173 elif ch in u'\r\n\x85\u2028\u2029': 1174 self.scan_line_break() 1175 chunks.extend(self.scan_flow_scalar_breaks(double, indent, start_marker)) 1176 else: 1177 raise ScannerError("while scanning a double-quoted scalar", start_marker, 1178 "found unknown escape character %r" % ch.encode('utf-8'), self.reader.get_marker()) 761 1179 else: 762 self.reader.forward(1) 763 self.reader.forward(1) 764 return ScalarToken('', False, marker, marker) 1180 return chunks 1181 1182 def scan_flow_scalar_spaces(self, double, indent, start_marker): 1183 # See the specification for details. 1184 chunks = [] 1185 length = 0 1186 while self.reader.peek(length) in u' \t': 1187 length += 1 1188 whitespaces = self.reader.prefix(length) 1189 self.reader.forward(length) 1190 ch = self.reader.peek() 1191 if ch == u'\0': 1192 raise ScannerError("while scanning a quoted scalar", start_marker, 1193 "found unexpected end of stream", self.reader.get_marker()) 1194 elif ch in u'\r\n\x85\u2028\u2029': 1195 line_break = self.scan_line_break() 1196 breaks = self.scan_flow_scalar_breaks(double, indent, start_marker) 1197 if line_break != u'\n': 1198 chunks.append(line_break) 1199 elif not breaks: 1200 chunks.append(u' ') 1201 chunks.extend(breaks) 1202 else: 1203 chunks.append(whitespaces) 1204 return chunks 1205 1206 def scan_flow_scalar_breaks(self, double, indent, start_marker): 1207 # See the specification for details. 1208 chunks = [] 1209 while True: 1210 while self.reader.column < indent and self.reader.peek() == u' ': 1211 self.reader.forward() 1212 if self.reader.column < indent \ 1213 and self.reader.peek() not in u'\0\r\n\x85\u2028\u2029': 1214 s = 's' 1215 if indent == 1: 1216 s = '' 1217 raise ScannerError("while scanning a quoted scalar", start_marker, 1218 "expected %d space%s indentation, but found %r" 1219 % (indent, s, self.reader.peek().encode('utf-8')), 1220 self.reader.get_marker()) 1221 while self.reader.peek() in u' \t': 1222 self.reader.forward() 1223 if self.reader.peek() in u'\r\n\x85\u2028\u2029': 1224 chunks.append(self.scan_line_break()) 1225 else: 1226 return chunks 765 1227 766 1228 def scan_plain(self): 1229 # See the specification for details. 1230 # We add an additional restriction for the flow context: 1231 # plain scalars in the flow context cannot contain ':' and '?'. 1232 # We also keep track of the `allow_simple_key` flag here. 1233 chunks = [] 1234 start_marker = self.reader.get_marker() 1235 end_marker = start_marker 767 1236 indent = self.indent+1 768 if indent < 1:1237 if indent == 0: 769 1238 indent = 1 770 space = False 1239 spaces = [] 1240 while True: 1241 length = 0 1242 if self.reader.peek() == u'#': 1243 break 1244 while True: 1245 ch = self.reader.peek(length) 1246 if ch in u'\0 \t\r\n\x85\u2028\u2029' \ 1247 or (not self.flow_level and ch == u':' and 1248 self.reader.peek(length+1) in u'\0 \t\r\n\x28\u2028\u2029') \ 1249 or (self.flow_level and ch in u',:?[]{}'): 1250 break 1251 length += 1 1252 if length == 0: 1253 break 1254 self.allow_simple_key = False 1255 chunks.extend(spaces) 1256 chunks.append(self.reader.prefix(length)) 1257 self.reader.forward(length) 1258 end_marker = self.reader.get_marker() 1259 spaces = self.scan_plain_spaces(indent) 1260 if not spaces or self.reader.peek() == u'#' \ 1261 or self.reader.column < indent: 1262 break 1263 return ScalarToken(u''.join(chunks), True, start_marker, end_marker) 1264 1265 def scan_plain_spaces(self, indent): 1266 # See the specification for details. 1267 # The specification is really confusing about tabs in plain scalars. 1268 # We just forbid them completely. Do not use tabs in YAML! 1269 chunks = [] 1270 length = 0 1271 while self.reader.peek(length) in u' ': 1272 length += 1 1273 whitespaces = self.reader.prefix(length) 1274 self.reader.forward(length) 1275 ch = self.reader.peek() 1276 if ch in u'\r\n\x85\u2028\u2029': 1277 line_break = self.scan_line_break() 1278 self.allow_simple_key = True 1279 breaks = [] 1280 while self.reader.peek() in u' \r\n\x85\u2028\u2029': 1281 if self.reader.peek() == ' ': 1282 self.reader.forward() 1283 else: 1284 breaks.append(self.scan_line_break()) 1285 if line_break != u'\n': 1286 chunks.append(line_break) 1287 elif not breaks: 1288 chunks.append(u' ') 1289 chunks.extend(breaks) 1290 elif whitespaces: 1291 chunks.append(whitespaces) 1292 return chunks 1293 1294 def scan_tag_handle(self, name, start_marker): 1295 # See the specification for details. 1296 # For some strange reasons, the specification does not allow '_' in 1297 # tag handles. I have allowed it anyway. 1298 if self.reader.peek() != u'!': 1299 raise ScannerError("while scanning a %s" % name, start_marker, 1300 "expected '!', but found %r" % ch.encode('utf-8'), 1301 self.reader.get_marker()) 1302 length = 1 1303 ch = self.reader.peek(length) 1304 if ch != u' ': 1305 while u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z' \ 1306 or ch in u'-_': 1307 length += 1 1308 ch = self.reader.peek(length) 1309 if ch != u'!': 1310 self.reader.forward(length) 1311 raise ScannerError("while scanning a %s" % name, start_marker, 1312 "expected '!', but found %r" % ch.encode('utf-8'), 1313 self.reader.get_marker()) 1314 length += 1 1315 value = self.reader.prefix(length) 1316 self.reader.forward(length) 1317 return value 1318 1319 def scan_tag_uri(self, name, start_marker): 1320 # See the specification for details. 1321 # Note: we do not check if URI is well-formed. 1322 chunks = [] 1323 length = 0 1324 ch = self.reader.peek(length) 1325 while u'0' <= ch <= u'9' or u'A' <= ch <= 'Z' or u'a' <= ch <= 'z' \ 1326 or ch in u'-;/?:@&=+$,_.!~*\'()[]%': 1327 if ch == u'%': 1328 chunks.append(self.reader.prefix(length)) 1329 self.reader.forward(length) 1330 length = 0 1331 chunks.append(self.scan_uri_escapes(name, start_marker)) 1332 else: 1333 length += 1 1334 ch = self.reader.peek(length) 1335 if length: 1336 chunks.append(self.reader.prefix(length)) 1337 self.reader.forward(length) 1338 length = 0 1339 if not chunks: 1340 raise ScannerError("while parsing a %s" % name, start_marker, 1341 "expected URI, but found %r" % ch.encode('utf-8'), 1342 self.reader.get_marker()) 1343 return u''.join(chunks) 1344 1345 def scan_uri_escapes(self, name, start_marker): 1346 # See the specification for details. 1347 bytes = [] 771 1348 marker = self.reader.get_marker() 772 while True: 773 while self.reader.peek() == u' ': 774 self.reader.forward() 775 space = True 776 while self.reader.peek() not in u'\0\r\n?:,[]{}#' \ 777 or (not space and self.reader.peek() == '#') \ 778 or (not self.flow_level and self.reader.peek() in '?,[]{}') \ 779 or (not self.flow_level and self.reader.peek() == ':' and self.reader.peek(2)[1] not in u' \0\r\n'): 780 space = self.reader.peek() not in u' \t' 781 self.reader.forward() 782 self.allow_simple_key = False 783 if self.reader.peek() not in u'\r\n': 784 break 785 while self.reader.peek() in u'\r\n': 786 self.reader.forward() 787 if not self.flow_level: 788 self.allow_simple_key = True 789 count = 0 790 while self.reader.peek() == u' ' and count < indent: 791 self.reader.forward() 792 count += 1 793 if count < indent: 794 break 795 space = True 796 return ScalarToken('', True, marker, marker) 1349 while self.reader.peek() == u'%': 1350 self.reader.forward() 1351 for k in range(2): 1352 if self.reader.peek(k) not in u'0123456789ABCDEFabcdef': 1353 raise ScannerError("while scanning a %s" % name, start_marker, 1354 "expected URI escape sequence of 2 hexdecimal numbers, but found %r" % 1355 (self.reader.peek(k).encode('utf-8')), self.reader.get_marker()) 1356 bytes.append(chr(int(self.reader.prefix(2), 16))) 1357 self.reader.forward(2) 1358 try: 1359 value = unicode(''.join(bytes), 'utf-8') 1360 except UnicodeDecodeError, exc: 1361 raise ScannerError("while scanning a %s" % name, start_marker, str(exc), marker) 1362 return value 797 1363 798 1364 def scan_line_break(self): … … 807 1373 ch = self.reader.peek() 808 1374 if ch in u'\r\n\x85': 809 if self.reader.p eek(2) == u'\r\n':1375 if self.reader.prefix(2) == u'\r\n': 810 1376 self.forward(2) 811 1377 else: … … 817 1383 return u'' 818 1384 819 def invalid_token(self):820 self.fail("invalid token")821 822 1385 #try: 823 1386 # import psyco
Note: See TracChangeset
for help on using the changeset viewer.
