| 1 | |
|---|
| 2 | """ |
|---|
| 3 | yaml.py |
|---|
| 4 | |
|---|
| 5 | Lexer for YAML, a human-friendly data serialization language |
|---|
| 6 | (http://yaml.org/). |
|---|
| 7 | |
|---|
| 8 | Written by Kirill Simonov <xi@resolvent.net>. |
|---|
| 9 | |
|---|
| 10 | License: Whatever suitable for inclusion into the Pygments package. |
|---|
| 11 | """ |
|---|
| 12 | |
|---|
| 13 | from pygments.lexer import \ |
|---|
| 14 | ExtendedRegexLexer, LexerContext, include, bygroups |
|---|
| 15 | from pygments.token import \ |
|---|
| 16 | Text, Comment, Punctuation, Name, Literal |
|---|
| 17 | |
|---|
| 18 | __all__ = ['YAMLLexer'] |
|---|
| 19 | |
|---|
| 20 | |
|---|
class YAMLLexerContext(LexerContext):
    """Lexer context extended with the indentation state of a YAML stream."""

    def __init__(self, *args, **kwds):
        super(YAMLLexerContext, self).__init__(*args, **kwds)
        # Indentation of the block scalar being scanned; None means it has
        # not been set (yet).
        self.block_scalar_indent = None
        # Candidate indentation level saved by the lexer callbacks.
        self.next_indent = 0
        # Current indentation level; it starts at -1 and the levels it
        # replaces are kept on ``indent_stack``.
        self.indent = -1
        self.indent_stack = []
|---|
| 30 | |
|---|
| 31 | |
|---|
def something(TokenClass):
    """Return a lexer callback that never produces an empty token.

    The callback yields a single ``(start, TokenClass, text)`` triple and
    moves ``context.pos`` to the end of the match.  When the match is empty
    it yields nothing and leaves ``context.pos`` untouched.
    """
    def callback(lexer, match, context):
        matched = match.group()
        if matched:
            yield match.start(), TokenClass, matched
            context.pos = match.end()
    return callback
|---|
| 41 | |
|---|
def reset_indent(TokenClass):
    """Return a lexer callback that resets all indentation state.

    The callback restores the context's indentation attributes to their
    initial values (empty stack, ``indent`` of -1, ``next_indent`` of 0, no
    block scalar indentation), then yields the whole match as *TokenClass*
    and advances ``context.pos`` past it.
    """
    def callback(lexer, match, context):
        # Forget every indentation level seen so far.
        context.block_scalar_indent = None
        context.next_indent = 0
        context.indent = -1
        context.indent_stack = []
        yield match.start(), TokenClass, match.group()
        context.pos = match.end()
    return callback
|---|
| 53 | |
|---|
def save_indent(TokenClass, start=False):
    """Save a possible indentation level.

    Returns a lexer callback that records the matched run of spaces as the
    candidate indentation (``context.next_indent``).

    With ``start=True`` the match is the leading whitespace of a line: the
    candidate is set to its length and, if that is smaller than the current
    level, enclosing levels are popped from ``context.indent_stack`` until
    the current level no longer exceeds it.  If the line then sits deeper
    than the level that was reached, only the part up to that level is
    emitted as *TokenClass*; the remainder is emitted as
    ``TokenClass.Error``.

    With ``start=False`` the match is whitespace inside the line and its
    length is added to the candidate.

    In both cases ``context.pos`` is advanced to the end of the match and
    empty tokens are never produced.
    """
    def callback(lexer, match, context):
        text = match.group()
        extra = ''
        if start:
            context.next_indent = len(text)
            if context.next_indent < context.indent:
                # Dedent: unwind to the nearest level that is not deeper
                # than this line.
                while context.next_indent < context.indent:
                    context.indent = context.indent_stack.pop()
                if context.next_indent > context.indent:
                    # The line does not align with any open level.  The
                    # level may be the -1 sentinel, so clamp it: a negative
                    # slice index would split the run relative to its end
                    # instead of at column 0.
                    split = max(context.indent, 0)
                    extra = text[split:]
                    text = text[:split]
        else:
            context.next_indent += len(text)
        if text:
            yield match.start(), TokenClass, text
        if extra:
            yield match.start() + len(text), TokenClass.Error, extra
        context.pos = match.end()
    return callback
|---|
| 75 | |
|---|
def set_indent(TokenClass, implicit=False):
    """Return a lexer callback that commits the saved indentation level.

    If the saved candidate (``context.next_indent``) is deeper than the
    current level, the current level is pushed on ``context.indent_stack``
    and the candidate becomes the current level.  Unless *implicit* is true,
    the length of the matched text is then added to the candidate.  The
    match is always yielded as *TokenClass* and ``context.pos`` is moved to
    its end.
    """
    def callback(lexer, match, context):
        indicator = match.group()
        candidate = context.next_indent
        if candidate > context.indent:
            # Open a new level, remembering the one it replaces.
            context.indent_stack.append(context.indent)
            context.indent = candidate
        if not implicit:
            context.next_indent = candidate + len(indicator)
        yield match.start(), TokenClass, indicator
        context.pos = match.end()
    return callback
|---|
| 88 | |
|---|
def set_block_scalar_indent(TokenClass):
    """Return a lexer callback that handles a block scalar header.

    The callback always clears ``context.block_scalar_indent`` first.  For
    an empty match it then stops without yielding.  Otherwise, if group 1
    of the match (the indentation digit) is present, the block scalar
    indentation is set to that number added to the current level, with a
    negative level counted as 0.  The whole match is yielded as *TokenClass*
    and ``context.pos`` is moved to its end.
    """
    def callback(lexer, match, context):
        context.block_scalar_indent = None
        header = match.group()
        if not header:
            return
        digit = match.group(1)
        if digit:
            base = max(context.indent, 0)
            context.block_scalar_indent = base + int(digit)
        yield match.start(), TokenClass, header
        context.pos = match.end()
    return callback
|---|
| 105 | |
|---|
def parse_block_scalar_empty_line(IndentTokenClass, ContentTokenClass):
    """Return a lexer callback for a spaces-only line in a block scalar.

    When the block scalar indentation is known and the line is longer than
    it, the first ``block_scalar_indent`` characters are yielded as
    *IndentTokenClass* and the rest as *ContentTokenClass*.  Otherwise the
    whole (non-empty) line is yielded as *IndentTokenClass*.  In every case
    ``context.pos`` is moved to the end of the match.
    """
    def callback(lexer, match, context):
        spaces = match.group()
        begin = match.start()
        width = context.block_scalar_indent
        if width is not None and len(spaces) > width:
            # Spaces beyond the scalar's indentation belong to its content.
            yield begin, IndentTokenClass, spaces[:width]
            yield begin + width, ContentTokenClass, spaces[width:]
        elif spaces:
            yield begin, IndentTokenClass, spaces
        context.pos = match.end()
    return callback
|---|
| 122 | |
|---|
def parse_block_scalar_indent(TokenClass):
    """Return a lexer callback for the leading spaces of a block scalar line.

    If the block scalar indentation is not known yet, a line indented no
    deeper than the current level (a negative level counts as 0) ends the
    scalar; any deeper line fixes the block scalar indentation to its own
    width.  If it is already known, a line indented less than it ends the
    scalar.

    Ending the scalar pops two entries from ``context.stack`` and yields
    nothing.  Otherwise the spaces are yielded as *TokenClass* and
    ``context.pos`` is moved past them.
    """
    def callback(lexer, match, context):
        spaces = match.group()
        width = len(spaces)
        undetermined = context.block_scalar_indent is None
        if undetermined:
            finished = width <= max(context.indent, 0)
        else:
            finished = width < context.block_scalar_indent
        if finished:
            # Leave the block scalar: drop two states from the stack.
            context.stack.pop()
            context.stack.pop()
            return
        if undetermined:
            # The first content line determines the indentation.
            context.block_scalar_indent = width
        if spaces:
            yield match.start(), TokenClass, spaces
            context.pos = match.end()
    return callback
|---|
| 142 | |
|---|
def parse_plain_scalar_indent(TokenClass):
    """Return a lexer callback for the leading spaces of a plain scalar line.

    A continuation line must be indented deeper than the current level.
    If it is not, two entries are popped from ``context.stack`` and nothing
    is yielded.  Otherwise the (non-empty) spaces are yielded as
    *TokenClass* and ``context.pos`` is moved past them.
    """
    def callback(lexer, match, context):
        spaces = match.group()
        if len(spaces) > context.indent:
            if spaces:
                yield match.start(), TokenClass, spaces
                context.pos = match.end()
        else:
            # Not a continuation of the scalar: drop two states.
            context.stack.pop()
            context.stack.pop()
    return callback
|---|
| 155 | |
|---|
| 156 | |
|---|
class YAMLLexer(ExtendedRegexLexer):
    """Lexer for the YAML language.

    The lexer is a state machine described by ``tokens``.  Indentation
    cannot be tracked by the regular expressions alone, so several rules
    use the module-level callback factories (``save_indent``,
    ``set_indent``, ``parse_block_scalar_indent``, ...), which read and
    update the indentation attributes of a ``YAMLLexerContext``.
    """

    name = 'YAML'
    aliases = ['yaml']
    filenames = ['*.yaml', '*.yml']
    mimetypes = ['text/x-yaml']

    tokens = {

        # the root rules
        'root': [
            # ignored whitespaces
            (r'[ ]+(?=#|$)', Text.Blank),
            # line breaks
            (r'\n+', Text.Break),
            # a comment
            (r'#[^\n]*', Comment.Single),
            # the '%YAML' directive
            (r'^%YAML(?=[ ]|$)', reset_indent(Name.Directive),
             'yaml-directive'),
            # the '%TAG' directive
            (r'^%TAG(?=[ ]|$)', reset_indent(Name.Directive),
             'tag-directive'),
            # document start and document end indicators
            (r'^(?:---|\.\.\.)(?=[ ]|$)',
             reset_indent(Punctuation.Document), 'block-line'),
            # indentation spaces at the start of a line that has content;
            # enters 'block-line' with 'indentation' on top of it
            (r'[ ]*(?![ \t\n\r\f\v]|$)',
             save_indent(Text.Indent, start=True),
             ('block-line', 'indentation')),
        ],

        # trailing whitespaces after directives or a block scalar indicator
        'ignored-line': [
            # ignored whitespaces
            (r'[ ]+(?=#|$)', Text.Blank),
            # a comment
            (r'#[^\n]*', Comment.Single),
            # line break; leaves this state together with the state that
            # entered it (a directive or the block scalar header)
            (r'\n', Text.Break, '#pop:2'),
        ],

        # the %YAML directive
        'yaml-directive': [
            # the version number
            (r'([ ]+)([0-9]+\.[0-9]+)',
             bygroups(Text.Blank, Literal.Version), 'ignored-line'),
        ],

        # the %TAG directive
        'tag-directive': [
            # a tag handle and the corresponding prefix
            (r'([ ]+)(!|![0-9A-Za-z_-]*!)'
             r'([ ]+)(!|!?[0-9A-Za-z;/?:@&=+$,_.!~*\'()\[\]%-]+)',
             bygroups(Text.Blank, Name.Type, Text.Blank, Name.Type),
             'ignored-line'),
        ],

        # block collection indicators and indentation spaces
        'indentation': [
            # trailing whitespaces are ignored; leaves both 'indentation'
            # and the 'block-line' state below it
            (r'[ ]*$', something(Text.Blank), '#pop:2'),
            # whitespaces preceding block collection indicators
            (r'[ ]+(?=[?:-](?:[ ]|$))', save_indent(Text.Indent)),
            # block collection indicators
            (r'[?:-](?=[ ]|$)', set_indent(Punctuation.Indicator)),
            # the beginning of a block line
            (r'[ ]*', save_indent(Text.Indent), '#pop'),
        ],

        # an indented line in the block context
        'block-line': [
            # the line end
            (r'[ ]*(?=#|$)', something(Text.Blank), '#pop'),
            # whitespaces separating tokens
            (r'[ ]+', Text.Blank),
            # tags, anchors and aliases
            include('descriptors'),
            # block collections and scalars
            include('block-nodes'),
            # flow collections and quoted scalars
            include('flow-nodes'),
            # a plain scalar; the pattern is a pure lookahead, so nothing
            # is consumed and something() emits no token here
            (r'(?=[^ \t\n\r\f\v?:,\[\]{}#&*!|>\'"%@`-]|[?:-][^ \t\n\r\f\v])',
             something(Literal.Scalar.Plain),
             'plain-scalar-in-block-context'),
        ],

        # tags, anchors, aliases
        'descriptors' : [
            # a full-form tag
            (r'!<[0-9A-Za-z;/?:@&=+$,_.!~*\'()\[\]%-]+>', Name.Type),
            # a tag in the form '!', '!suffix' or '!handle!suffix'
            (r'!(?:[0-9A-Za-z_-]+)?'
             r'(?:![0-9A-Za-z;/?:@&=+$,_.!~*\'()\[\]%-]+)?', Name.Type),
            # an anchor
            (r'&[0-9A-Za-z_-]+', Name.Anchor),
            # an alias
            (r'\*[0-9A-Za-z_-]+', Name.Alias),
        ],

        # block collections and scalars
        'block-nodes': [
            # implicit key
            (r':(?=[ ]|$)', set_indent(Punctuation.Indicator, implicit=True)),
            # literal and folded scalars; the header state is entered on
            # top of the content state
            (r'[|>]', Punctuation.Indicator,
             ('block-scalar-content', 'block-scalar-header')),
        ],

        # flow collections and quoted scalars
        'flow-nodes': [
            # a flow sequence
            (r'\[', Punctuation.Indicator, 'flow-sequence'),
            # a flow mapping
            (r'\{', Punctuation.Indicator, 'flow-mapping'),
            # a single-quoted scalar
            (r'\'', Literal.Scalar.Flow.Quote, 'single-quoted-scalar'),
            # a double-quoted scalar
            (r'\"', Literal.Scalar.Flow.Quote, 'double-quoted-scalar'),
        ],

        # the content of a flow collection
        'flow-collection': [
            # whitespaces
            (r'[ ]+', Text.Blank),
            # line breaks
            (r'\n+', Text.Break),
            # a comment
            (r'#[^\n]*', Comment.Single),
            # simple indicators
            (r'[?:,]', Punctuation.Indicator),
            # tags, anchors and aliases
            include('descriptors'),
            # nested collections and quoted scalars
            include('flow-nodes'),
            # a plain scalar (pure lookahead; no token is emitted here)
            (r'(?=[^ \t\n\r\f\v?:,\[\]{}#&*!|>\'"%@`])',
             something(Literal.Scalar.Plain),
             'plain-scalar-in-flow-context'),
        ],

        # a flow sequence indicated by '[' and ']'
        'flow-sequence': [
            # include flow collection rules
            include('flow-collection'),
            # the closing indicator
            (r'\]', Punctuation.Indicator, '#pop'),
        ],

        # a flow mapping indicated by '{' and '}'
        'flow-mapping': [
            # include flow collection rules
            include('flow-collection'),
            # the closing indicator
            (r'\}', Punctuation.Indicator, '#pop'),
        ],

        # block scalar lines
        'block-scalar-content': [
            # line break
            (r'\n', Text.Break),
            # a line consisting only of spaces
            (r'^[ ]+$',
             parse_block_scalar_empty_line(Text.Indent,
                                           Literal.Scalar.Block)),
            # indentation spaces (we may leave the state here)
            (r'^[ ]*', parse_block_scalar_indent(Text.Indent)),
            # line content
            (r'[^\n\r\f\v]+', Literal.Scalar.Block),
        ],

        # the header of a literal or folded scalar: an optional
        # indentation indicator and an optional chomping flag
        'block-scalar-header': [
            # indentation indicator followed by chomping flag
            (r'([1-9])?[+-]?(?=[ ]|$)',
             set_block_scalar_indent(Punctuation.Indicator),
             'ignored-line'),
            # chomping flag followed by indentation indicator
            (r'[+-]?([1-9])?(?=[ ]|$)',
             set_block_scalar_indent(Punctuation.Indicator),
             'ignored-line'),
        ],

        # ignored and regular whitespaces in quoted scalars
        'quoted-scalar-whitespaces': [
            # leading and trailing whitespaces are ignored
            (r'^[ ]+|[ ]+$', Text.Blank),
            # line breaks are ignored
            (r'\n+', Text.Break),
            # other whitespaces are a part of the value
            (r'[ ]+', Literal.Scalar.Flow),
        ],

        # single-quoted scalars
        'single-quoted-scalar': [
            # include whitespace and line break rules
            include('quoted-scalar-whitespaces'),
            # escaping of the quote character
            (r'\'\'', Literal.Scalar.Flow.Escape),
            # regular non-whitespace characters
            (r'[^ \t\n\r\f\v\']+', Literal.Scalar.Flow),
            # the closing quote
            (r'\'', Literal.Scalar.Flow.Quote, '#pop'),
        ],

        # double-quoted scalars
        'double-quoted-scalar': [
            # include whitespace and line break rules
            include('quoted-scalar-whitespaces'),
            # escaping of special characters
            (r'\\[0abt\tn\nvfre "\\N_LP]', Literal.Scalar.Flow.Escape),
            # escape codes
            (r'\\(?:x[0-9A-Fa-f]{2}|u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8})',
             Literal.Scalar.Flow.Escape),
            # regular non-whitespace characters
            (r'[^ \t\n\r\f\v\"\\]+', Literal.Scalar.Flow),
            # the closing quote
            (r'"', Literal.Scalar.Flow.Quote, '#pop'),
        ],

        # the beginning of a new line while scanning a plain scalar
        'plain-scalar-in-block-context-new-line': [
            # empty lines
            (r'^[ ]+$', Text.Blank),
            # line breaks
            (r'\n+', Text.Break),
            # document start and document end indicators (pure lookahead;
            # the indicator itself is not consumed by this rule)
            (r'^(?=---|\.\.\.)', something(Punctuation.Document), '#pop:3'),
            # indentation spaces (we may leave the block line state here)
            (r'^[ ]*', parse_plain_scalar_indent(Text.Indent), '#pop'),
        ],

        # a plain scalar in the block context
        'plain-scalar-in-block-context': [
            # the scalar ends with the ':' indicator
            (r'[ ]*(?=:[ ]|:$)', something(Text.Blank), '#pop'),
            # the scalar ends with whitespaces followed by a comment
            (r'[ ]+(?=#)', Text.Blank, '#pop'),
            # trailing whitespaces are ignored
            (r'[ ]+$', Text.Blank),
            # line breaks are ignored
            (r'\n+', Text.Break, 'plain-scalar-in-block-context-new-line'),
            # other whitespaces are a part of the value
            (r'[ ]+', Literal.Scalar.Plain),
            # regular non-whitespace characters; a ':' counts only when it
            # is not followed by whitespace
            (r'(?::(?![ \t\n\r\f\v])|[^ \t\n\r\f\v:])+',
             Literal.Scalar.Plain),
        ],

        # a plain scalar in the flow context
        'plain-scalar-in-flow-context': [
            # the scalar ends with an indicator character
            (r'[ ]*(?=[,:?\[\]{}])', something(Text.Blank), '#pop'),
            # the scalar ends with a comment
            (r'[ ]+(?=#)', Text.Blank, '#pop'),
            # leading and trailing whitespaces are ignored
            (r'^[ ]+|[ ]+$', Text.Blank),
            # line breaks are ignored
            (r'\n+', Text.Break),
            # other whitespaces are a part of the value
            (r'[ ]+', Literal.Scalar.Plain),
            # regular non-whitespace characters
            (r'[^ \t\n\r\f\v,:?\[\]{}]+', Literal.Scalar.Plain),
        ],

    }

    def get_tokens_unprocessed(self, text=None, context=None):
        """Tokenize *text*, creating a ``YAMLLexerContext`` when none is given.

        A fresh context starting at position 0 is created when *context* is
        None; the call is then delegated to the base class implementation
        and its result is returned unchanged.
        """
        if context is None:
            context = YAMLLexerContext(text, 0)
        return super(YAMLLexer, self).get_tokens_unprocessed(text, context)
|---|
| 430 | |
|---|
| 431 | |
|---|