""" yaml.py Lexer for YAML, a human-friendly data serialization language (http://yaml.org/). Written by Kirill Simonov . License: Whatever suitable for inclusion into the Pygments package. """ from pygments.lexer import \ ExtendedRegexLexer, LexerContext, include, bygroups from pygments.token import \ Text, Comment, Punctuation, Name, Literal __all__ = ['YAMLLexer'] class YAMLLexerContext(LexerContext): """Indentation context for the YAML lexer.""" def __init__(self, *args, **kwds): super(YAMLLexerContext, self).__init__(*args, **kwds) self.indent_stack = [] self.indent = -1 self.next_indent = 0 self.block_scalar_indent = None def something(TokenClass): """Do not produce empty tokens.""" def callback(lexer, match, context): text = match.group() if not text: return yield match.start(), TokenClass, text context.pos = match.end() return callback def reset_indent(TokenClass): """Reset the indentation levels.""" def callback(lexer, match, context): text = match.group() context.indent_stack = [] context.indent = -1 context.next_indent = 0 context.block_scalar_indent = None yield match.start(), TokenClass, text context.pos = match.end() return callback def save_indent(TokenClass, start=False): """Save a possible indentation level.""" def callback(lexer, match, context): text = match.group() extra = '' if start: context.next_indent = len(text) if context.next_indent < context.indent: while context.next_indent < context.indent: context.indent = context.indent_stack.pop() if context.next_indent > context.indent: extra = text[context.indent:] text = text[:context.indent] else: context.next_indent += len(text) if text: yield match.start(), TokenClass, text if extra: yield match.start()+len(text), TokenClass.Error, extra context.pos = match.end() return callback def set_indent(TokenClass, implicit=False): """Set the previously saved indentation level.""" def callback(lexer, match, context): text = match.group() if context.indent < context.next_indent: context.indent_stack.append(context.indent) context.indent = context.next_indent if not implicit: context.next_indent += len(text) yield match.start(), TokenClass, text context.pos = match.end() return callback def set_block_scalar_indent(TokenClass): """Set an explicit indentation level for a block scalar.""" def callback(lexer, match, context): text = match.group() context.block_scalar_indent = None if not text: return increment = match.group(1) if increment: current_indent = max(context.indent, 0) increment = int(increment) context.block_scalar_indent = current_indent + increment if text: yield match.start(), TokenClass, text context.pos = match.end() return callback def parse_block_scalar_empty_line(IndentTokenClass, ContentTokenClass): """Process an empty line in a block scalar.""" def callback(lexer, match, context): text = match.group() if (context.block_scalar_indent is None or len(text) <= context.block_scalar_indent): if text: yield match.start(), IndentTokenClass, text else: indentation = text[:context.block_scalar_indent] content = text[context.block_scalar_indent:] yield match.start(), IndentTokenClass, indentation yield (match.start()+context.block_scalar_indent, ContentTokenClass, content) context.pos = match.end() return callback def parse_block_scalar_indent(TokenClass): """Process indentation spaces in a block scalar.""" def callback(lexer, match, context): text = match.group() if context.block_scalar_indent is None: if len(text) <= max(context.indent, 0): context.stack.pop() context.stack.pop() return context.block_scalar_indent = 
def something(TokenClass):
    """Do not produce empty tokens."""
    def callback(lexer, match, context):
        text = match.group()
        if not text:
            return
        yield match.start(), TokenClass, text
        context.pos = match.end()
    return callback


def reset_indent(TokenClass):
    """Reset the indentation levels."""
    def callback(lexer, match, context):
        text = match.group()
        context.indent_stack = []
        context.indent = -1
        context.next_indent = 0
        context.block_scalar_indent = None
        yield match.start(), TokenClass, text
        context.pos = match.end()
    return callback


def save_indent(TokenClass, start=False):
    """Save a possible indentation level."""
    def callback(lexer, match, context):
        text = match.group()
        extra = ''
        if start:
            context.next_indent = len(text)
            if context.next_indent < context.indent:
                while context.next_indent < context.indent:
                    context.indent = context.indent_stack.pop()
            if context.next_indent > context.indent:
                extra = text[context.indent:]
                text = text[:context.indent]
        else:
            context.next_indent += len(text)
        if text:
            yield match.start(), TokenClass, text
        if extra:
            yield match.start()+len(text), TokenClass.Error, extra
        context.pos = match.end()
    return callback


def set_indent(TokenClass, implicit=False):
    """Set the previously saved indentation level."""
    def callback(lexer, match, context):
        text = match.group()
        if context.indent < context.next_indent:
            context.indent_stack.append(context.indent)
            context.indent = context.next_indent
        if not implicit:
            context.next_indent += len(text)
        yield match.start(), TokenClass, text
        context.pos = match.end()
    return callback


def set_block_scalar_indent(TokenClass):
    """Set an explicit indentation level for a block scalar."""
    def callback(lexer, match, context):
        text = match.group()
        context.block_scalar_indent = None
        if not text:
            return
        increment = match.group(1)
        if increment:
            current_indent = max(context.indent, 0)
            increment = int(increment)
            context.block_scalar_indent = current_indent + increment
        if text:
            yield match.start(), TokenClass, text
            context.pos = match.end()
    return callback


def parse_block_scalar_empty_line(IndentTokenClass, ContentTokenClass):
    """Process an empty line in a block scalar."""
    def callback(lexer, match, context):
        text = match.group()
        if (context.block_scalar_indent is None or
                len(text) <= context.block_scalar_indent):
            if text:
                yield match.start(), IndentTokenClass, text
        else:
            indentation = text[:context.block_scalar_indent]
            content = text[context.block_scalar_indent:]
            yield match.start(), IndentTokenClass, indentation
            yield (match.start()+context.block_scalar_indent,
                   ContentTokenClass, content)
        context.pos = match.end()
    return callback


def parse_block_scalar_indent(TokenClass):
    """Process indentation spaces in a block scalar."""
    def callback(lexer, match, context):
        text = match.group()
        if context.block_scalar_indent is None:
            if len(text) <= max(context.indent, 0):
                context.stack.pop()
                context.stack.pop()
                return
            context.block_scalar_indent = len(text)
        else:
            if len(text) < context.block_scalar_indent:
                context.stack.pop()
                context.stack.pop()
                return
        if text:
            yield match.start(), TokenClass, text
            context.pos = match.end()
    return callback


def parse_plain_scalar_indent(TokenClass):
    """Process indentation spaces in a plain scalar."""
    def callback(lexer, match, context):
        text = match.group()
        if len(text) <= context.indent:
            context.stack.pop()
            context.stack.pop()
            return
        if text:
            yield match.start(), TokenClass, text
            context.pos = match.end()
    return callback

class YAMLLexer(ExtendedRegexLexer):
    """Lexer for the YAML language."""

    name = 'YAML'
    aliases = ['yaml']
    filenames = ['*.yaml', '*.yml']
    mimetypes = ['text/x-yaml']

    tokens = {
        # the root rules
        'root': [
            # ignored whitespaces
            (r'[ ]+(?=#|$)', Text.Blank),
            # line breaks
            (r'\n+', Text.Break),
            # a comment
            (r'#[^\n]*', Comment.Single),
            # the '%YAML' directive
            (r'^%YAML(?=[ ]|$)', reset_indent(Name.Directive),
             'yaml-directive'),
            # the '%TAG' directive
            (r'^%TAG(?=[ ]|$)', reset_indent(Name.Directive),
             'tag-directive'),
            # document start and document end indicators
            (r'^(?:---|\.\.\.)(?=[ ]|$)',
             reset_indent(Punctuation.Document), 'block-line'),
            # indentation spaces
            (r'[ ]*(?![ \t\n\r\f\v]|$)',
             save_indent(Text.Indent, start=True),
             ('block-line', 'indentation')),
        ],

        # trailing whitespaces after directives or a block scalar indicator
        'ignored-line': [
            # ignored whitespaces
            (r'[ ]+(?=#|$)', Text.Blank),
            # a comment
            (r'#[^\n]*', Comment.Single),
            # line break
            (r'\n', Text.Break, '#pop:2'),
        ],

        # the %YAML directive
        'yaml-directive': [
            # the version number
            (r'([ ]+)([0-9]+\.[0-9]+)',
             bygroups(Text.Blank, Literal.Version), 'ignored-line'),
        ],

        # the %TAG directive
        'tag-directive': [
            # a tag handle and the corresponding prefix
            (r'([ ]+)(!|![0-9A-Za-z_-]*!)'
             r'([ ]+)(!|!?[0-9A-Za-z;/?:@&=+$,_.!~*\'()\[\]%-]+)',
             bygroups(Text.Blank, Name.Type, Text.Blank, Name.Type),
             'ignored-line'),
        ],

        # block scalar indicators and indentation spaces
        'indentation': [
            # trailing whitespaces are ignored
            (r'[ ]*$', something(Text.Blank), '#pop:2'),
            # whitespaces preceding block collection indicators
            (r'[ ]+(?=[?:-](?:[ ]|$))', save_indent(Text.Indent)),
            # block collection indicators
            (r'[?:-](?=[ ]|$)', set_indent(Punctuation.Indicator)),
            # the beginning of a block line
            (r'[ ]*', save_indent(Text.Indent), '#pop'),
        ],

        # an indented line in the block context
        'block-line': [
            # the line end
            (r'[ ]*(?=#|$)', something(Text.Blank), '#pop'),
            # whitespaces separating tokens
            (r'[ ]+', Text.Blank),
            # tags, anchors and aliases
            include('descriptors'),
            # block collections and scalars
            include('block-nodes'),
            # flow collections and quoted scalars
            include('flow-nodes'),
            # a plain scalar
            (r'(?=[^ \t\n\r\f\v?:,\[\]{}#&*!|>\'"%@`-]|[?:-][^ \t\n\r\f\v])',
             something(Literal.Scalar.Plain),
             'plain-scalar-in-block-context'),
        ],

        # tags, anchors, aliases
        'descriptors': [
            # a full-form tag
            (r'!<[0-9A-Za-z;/?:@&=+$,_.!~*\'()\[\]%-]+>', Name.Type),
            # a tag in the form '!', '!suffix' or '!handle!suffix'
            (r'!(?:[0-9A-Za-z_-]+)?'
             r'(?:![0-9A-Za-z;/?:@&=+$,_.!~*\'()\[\]%-]+)?', Name.Type),
            # an anchor
            (r'&[0-9A-Za-z_-]+', Name.Anchor),
            # an alias
            (r'\*[0-9A-Za-z_-]+', Name.Alias),
        ],

        # block collections and scalars
        'block-nodes': [
            # implicit key
            (r':(?=[ ]|$)', set_indent(Punctuation.Indicator,
                                       implicit=True)),
            # literal and folded scalars
            (r'[|>]', Punctuation.Indicator,
             ('block-scalar-content', 'block-scalar-header')),
        ],

        # flow collections and quoted scalars
        'flow-nodes': [
            # a flow sequence
            (r'\[', Punctuation.Indicator, 'flow-sequence'),
            # a flow mapping
            (r'\{', Punctuation.Indicator, 'flow-mapping'),
            # a single-quoted scalar
            (r'\'', Literal.Scalar.Flow.Quote, 'single-quoted-scalar'),
            # a double-quoted scalar
            (r'\"', Literal.Scalar.Flow.Quote, 'double-quoted-scalar'),
        ],

        # the content of a flow collection
        'flow-collection': [
            # whitespaces
            (r'[ ]+', Text.Blank),
            # line breaks
            (r'\n+', Text.Break),
            # a comment
            (r'#[^\n]*', Comment.Single),
            # simple indicators
            (r'[?:,]', Punctuation.Indicator),
            # tags, anchors and aliases
            include('descriptors'),
            # nested collections and quoted scalars
            include('flow-nodes'),
            # a plain scalar
            (r'(?=[^ \t\n\r\f\v?:,\[\]{}#&*!|>\'"%@`])',
             something(Literal.Scalar.Plain),
             'plain-scalar-in-flow-context'),
        ],

        # a flow sequence indicated by '[' and ']'
        'flow-sequence': [
            # include flow collection rules
            include('flow-collection'),
            # the closing indicator
            (r'\]', Punctuation.Indicator, '#pop'),
        ],

        # a flow mapping indicated by '{' and '}'
        'flow-mapping': [
            # include flow collection rules
            include('flow-collection'),
            # the closing indicator
            (r'\}', Punctuation.Indicator, '#pop'),
        ],

        # block scalar lines
        'block-scalar-content': [
            # line break
            (r'\n', Text.Break),
            # empty line
            (r'^[ ]+$',
             parse_block_scalar_empty_line(Text.Indent,
                                           Literal.Scalar.Block)),
            # indentation spaces (we may leave the state here)
            (r'^[ ]*', parse_block_scalar_indent(Text.Indent)),
            # line content
            (r'[^\n\r\f\v]+', Literal.Scalar.Block),
        ],

        # the content of a literal or folded scalar
        'block-scalar-header': [
            # indentation indicator followed by chomping flag
            (r'([1-9])?[+-]?(?=[ ]|$)',
             set_block_scalar_indent(Punctuation.Indicator),
             'ignored-line'),
            # chomping flag followed by indentation indicator
            (r'[+-]?([1-9])?(?=[ ]|$)',
             set_block_scalar_indent(Punctuation.Indicator),
             'ignored-line'),
        ],

        # ignored and regular whitespaces in quoted scalars
        'quoted-scalar-whitespaces': [
            # leading and trailing whitespaces are ignored
            (r'^[ ]+|[ ]+$', Text.Blank),
            # line breaks are ignored
            (r'\n+', Text.Break),
            # other whitespaces are a part of the value
            (r'[ ]+', Literal.Scalar.Flow),
        ],

        # single-quoted scalars
        'single-quoted-scalar': [
            # include whitespace and line break rules
            include('quoted-scalar-whitespaces'),
            # escaping of the quote character
            (r'\'\'', Literal.Scalar.Flow.Escape),
            # regular non-whitespace characters
            (r'[^ \t\n\r\f\v\']+', Literal.Scalar.Flow),
            # the closing quote
            (r'\'', Literal.Scalar.Flow.Quote, '#pop'),
        ],

        # double-quoted scalars
        'double-quoted-scalar': [
            # include whitespace and line break rules
            include('quoted-scalar-whitespaces'),
            # escaping of special characters
            (r'\\[0abt\tn\nvfre "\\N_LP]', Literal.Scalar.Flow.Escape),
            # escape codes
            (r'\\(?:x[0-9A-Fa-f]{2}|u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8})',
             Literal.Scalar.Flow.Escape),
            # regular non-whitespace characters
            (r'[^ \t\n\r\f\v\"\\]+', Literal.Scalar.Flow),
            # the closing quote
            (r'"', Literal.Scalar.Flow.Quote, '#pop'),
        ],

        # the beginning of a new line while scanning a plain scalar
        'plain-scalar-in-block-context-new-line': [
            # empty lines
            (r'^[ ]+$', Text.Blank),
            # line breaks
            (r'\n+', Text.Break),
            # document start and document end indicators
            (r'^(?=---|\.\.\.)', something(Punctuation.Document), '#pop:3'),
            # indentation spaces (we may leave the block line state here)
            (r'^[ ]*', parse_plain_scalar_indent(Text.Indent), '#pop'),
        ],

        # a plain scalar in the block context
        'plain-scalar-in-block-context': [
            # the scalar ends with the ':' indicator
            (r'[ ]*(?=:[ ]|:$)', something(Text.Blank), '#pop'),
            # the scalar ends with whitespaces followed by a comment
            (r'[ ]+(?=#)', Text.Blank, '#pop'),
            # trailing whitespaces are ignored
            (r'[ ]+$', Text.Blank),
            # line breaks are ignored
            (r'\n+', Text.Break, 'plain-scalar-in-block-context-new-line'),
            # other whitespaces are a part of the value
            (r'[ ]+', Literal.Scalar.Plain),
            # regular non-whitespace characters
            (r'(?::(?![ \t\n\r\f\v])|[^ \t\n\r\f\v:])+',
             Literal.Scalar.Plain),
        ],

        # a plain scalar in the flow context
        'plain-scalar-in-flow-context': [
            # the scalar ends with an indicator character
            (r'[ ]*(?=[,:?\[\]{}])', something(Text.Blank), '#pop'),
            # the scalar ends with a comment
            (r'[ ]+(?=#)', Text.Blank, '#pop'),
            # leading and trailing whitespaces are ignored
            (r'^[ ]+|[ ]+$', Text.Blank),
            # line breaks are ignored
            (r'\n+', Text.Break),
            # other whitespaces are a part of the value
            (r'[ ]+', Literal.Scalar.Plain),
            # regular non-whitespace characters
            (r'[^ \t\n\r\f\v,:?\[\]{}]+', Literal.Scalar.Plain),
        ],
    }

    def get_tokens_unprocessed(self, text=None, context=None):
        if context is None:
            context = YAMLLexerContext(text, 0)
        return super(YAMLLexer, self).get_tokens_unprocessed(text, context)
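
# A minimal usage sketch, assuming Pygments is installed.  It relies only on
# the documented ``highlight``/formatter API; the sample document below is
# purely illustrative.
if __name__ == '__main__':
    from pygments import highlight
    from pygments.formatters import TerminalFormatter

    sample = (
        "%YAML 1.1\n"
        "---\n"
        "invoice: 34843\n"
        "tags: [billing, overdue]\n"
        "comments: >\n"
        "  Late delivery, contact\n"
        "  the billing department.\n"
    )
    # Highlight the sample with ANSI colours to exercise the lexer end to end.
    print(highlight(sample, YAMLLexer(), TerminalFormatter()))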