import pyparsing as pp
from .bibdatabase import BibDataStringExpression
# General helpers
def _strip_after_new_lines(s):
    """Remove leading whitespace from every line of ``s`` except the first.

    The text is split with ``str.splitlines`` and re-joined with a single
    newline character, so a final line break is not preserved. Trailing
    whitespace inside a line is left untouched.

    :param s: string to process
    :returns: the re-joined string
    """
    return '\n'.join(
        line if index == 0 else line.lstrip()
        for index, line in enumerate(s.splitlines()))
def strip_after_new_lines(s):
    """Remove leading whitespace in all but the first line.

    :param s: string or BibDataStringExpression
    :returns: for a plain string, the processed string; for a
        BibDataStringExpression, the very same object after the processing
        has been handed to its ``apply_on_strings`` method.
    """
    if not isinstance(s, BibDataStringExpression):
        return _strip_after_new_lines(s)
    # Expressions are processed through their own hook (presumably in place --
    # confirm against BibDataStringExpression) and returned as-is.
    s.apply_on_strings(_strip_after_new_lines)
    return s
def add_logger_parse_action(expr, log_func):
    """Make ``expr`` report each of its matches through ``log_func``.

    :param expr: expression offering ``addParseAction`` and ``resultsName``
    :param log_func: callable taking the formatted message as only argument
    """
    def _report(string_, location, tokens):
        # resultsName is read at match time, not at registration time.
        message = "Found {}: {}".format(expr.resultsName, tokens)
        log_func(message)

    expr.addParseAction(_report)
# Parse action helpers
# Helpers for returning values from the parsed tokens. Shaped as pyparsing's
# parse actions. See pyparsing documentation for the arguments.
def first_token(string_, location, token):
    """Parse action returning the only token of a match.

    :param string_: parsed string (unused)
    :param location: match location (unused)
    :param token: sequence of matched tokens, expected to hold one item
    :returns: ``token[0]``
    :raises AssertionError: if ``token`` does not hold exactly one item
    """
    # TODO Handle this case correctly!
    # Raised explicitly rather than through an ``assert`` statement so that
    # the check is still performed when Python runs with -O.
    if len(token) != 1:
        raise AssertionError(
            "Expected exactly one token, got {}.".format(len(token)))
    return token[0]
def remove_trailing_newlines(string_, location, token):
    """Parse action stripping newline characters from the end of a match.

    :returns: the first token without trailing newline characters, or None
        when the first token is empty (or otherwise falsy).
    """
    text = token[0]
    if not text:
        return None
    return text.rstrip('\n')
def remove_braces(string_, location, token):
    """Parse action dropping one enclosing brace from each end of a match.

    A leading '{' and a trailing '}' are removed independently of each other:
    either one is dropped when present, even if the other is missing.

    :returns: the first token without its outermost braces ('' when empty)
    """
    text = token[0]
    if len(text) == 0:
        return ''
    # Booleans converted to offsets: 1 when the brace is there, 0 otherwise.
    begin = int(text[0] == '{')
    stop = len(text) - int(text[-1] == '}')
    return text[begin:stop]
def field_to_pair(string_, location, token):
    """
    Turn the parsed element named 'Field' into a pair.

    :returns: (name, value), the value having gone through
        strip_after_new_lines.
    """
    parsed_field = token.get('Field')
    name = parsed_field.get('FieldName')
    raw_value = parsed_field.get('Value')
    # For pyparsing >= 2.3.1 (see #225 and API change note in pyparsing's
    # Changelog): the value comes wrapped in a ParseResults, unwrap it.
    if isinstance(raw_value, pp.ParseResults):
        raw_value = raw_value[0]
    return (name, strip_after_new_lines(raw_value))
# Expressions helpers
def in_braces_or_pars(exp):
    """
    exp -> {exp}|(exp)

    Wrap ``exp`` so that it must be enclosed either in braces or in
    parentheses; the delimiters themselves are suppressed from the results.
    The braced form is the first alternative.
    """
    braced = pp.Suppress('{') + exp + pp.Suppress('}')
    parenthesised = pp.Suppress('(') + exp + pp.Suppress(')')
    return braced | parenthesised
class BibtexExpression(object):
    """Gives access to pyparsing expressions.

    Attributes are pyparsing expressions for the following elements:

        * main_expression: the bibtex file
        * string_def: a string definition
        * preamble_decl: a preamble declaration
        * explicit_comment: an explicit comment
        * entry: an entry definition
        * implicit_comment: an implicit comment

    The parse actions applied to string names and string expressions can be
    replaced after construction with set_string_name_parse_action and
    set_string_expression_parse_action.
    """

    # Alias of pyparsing's exception, reachable through the class/instance
    # (the citekey parse action below raises it as self.ParseException).
    ParseException = pp.ParseException

    def __init__(self):
        """Build the grammar and store its top-level expressions on self.

        The string name and string expression parse actions are initialised
        to functions returning None.
        """

        # Bibtex keywords

        string_def_start = pp.CaselessKeyword("@string")
        preamble_start = pp.CaselessKeyword("@preamble")
        comment_line_start = pp.CaselessKeyword('@comment')

        # String names
        string_name = pp.Word(pp.alphanums + '_-:')('StringName')
        # Install a default callback, then register the bound trampoline so
        # that the callback can still be swapped after the grammar is built.
        self.set_string_name_parse_action(lambda s, l, t: None)
        string_name.addParseAction(self._string_name_parse_action)

        # Values inside bibtex fields
        # Values can be integer or string expressions. The latter may use
        # quoted or braced values.

        # Integer values
        integer = pp.Word(pp.nums)('Integer')

        # Braced values: braced values can contain nested (but balanced) braces
        braced_value_content = pp.CharsNotIn('{}')
        braced_value = pp.Forward()  # Recursive definition for nested braces
        braced_value <<= pp.originalTextFor(
            '{' + pp.ZeroOrMore(braced_value | braced_value_content) + '}'
            )('BracedValue')
        braced_value.setParseAction(remove_braces)
        # TODO add ignore for "\}" and "\{" ?
        # TODO @ are not parsed by bibtex in braces

        # Quoted values: may contain braced content with balanced braces
        brace_in_quoted = pp.nestedExpr('{', '}', ignoreExpr=None)
        text_in_quoted = pp.CharsNotIn('"{}')
        # (quotes should be escaped by braces in quoted value)
        quoted_value = pp.originalTextFor(
            '"' + pp.ZeroOrMore(text_in_quoted | brace_in_quoted) + '"'
            )('QuotedValue')
        quoted_value.addParseAction(pp.removeQuotes)

        # String expressions: quoted values, braced values or string names
        # joined by '#'
        string_expr = pp.delimitedList(
            (quoted_value | braced_value | string_name), delim='#'
            )('StringExpression')
        # Same replaceable-callback scheme as for string names above.
        self.set_string_expression_parse_action(lambda s, l, t: None)
        string_expr.addParseAction(self._string_expr_parse_action)

        value = (integer | string_expr)('Value')

        # Entries

        # @EntryType { ...
        entry_type = (pp.Suppress('@') + pp.Word(pp.alphas))('EntryType')
        entry_type.setParseAction(first_token)

        # Entry key: any character up to a ',' without leading and trailing
        # spaces. Also exclude spaces and prevent it from being empty.
        key = pp.SkipTo(',')('Key')  # TODO Maybe also exclude @',\#}{~%

        def citekeyParseAction(string_, location, token):
            """Parse action for validating citekeys.

            It ensures citekey is not empty and has no space.

            :args: see pyparsing documentation.
            :returns: the citekey stripped of surrounding whitespace.
            :raises ParseException: if the stripped citekey is empty or
                contains a whitespace character.
            """
            key = first_token(string_, location, token).strip()
            if len(key) < 1:
                raise self.ParseException(
                    string_, loc=location, msg="Empty citekeys are not allowed.")
            for i, c in enumerate(key):
                if c.isspace():
                    # i is the offset of the character inside the stripped key
                    raise self.ParseException(
                        string_, loc=(location + i),
                        msg="Whitespace not allowed in citekeys.")
            return key

        key.setParseAction(citekeyParseAction)

        # Field name: word of letters, digits, dashes, underscores,
        # parentheses, dots and plus signs
        field_name = pp.Word(pp.alphanums + '_-().+')('FieldName')
        field_name.setParseAction(first_token)

        # Field: field_name = value
        field = pp.Group(field_name + pp.Suppress('=') + value)('Field')
        field.setParseAction(field_to_pair)

        # List of fields: comma separated fields, optional trailing comma
        field_list = (pp.delimitedList(field) + pp.Suppress(pp.Optional(','))
                      )('Fields')
        # Turn the (name, value) pairs into a dict. Pairs are inserted in
        # reverse order, so for a repeated name the first occurrence is the
        # one that ends up in the dict.
        field_list.setParseAction(
            lambda s, l, t: {k: v for (k, v) in reversed(t.get('Fields'))})

        # Entry: type, key, and fields
        self.entry = (entry_type +
                      in_braces_or_pars(key + pp.Suppress(',') + field_list)
                      )('Entry')

        # Other stuff: comments, string definitions, and preamble declarations

        # Explicit comments: @comment + everything up to next valid declaration
        # starting on new line.
        not_an_implicit_comment = (pp.LineEnd() + pp.Literal('@')
                                   ) | pp.StringEnd()
        self.explicit_comment = (
            pp.Suppress(comment_line_start) +
            pp.originalTextFor(pp.SkipTo(not_an_implicit_comment),
                               asString=True))('ExplicitComment')
        self.explicit_comment.addParseAction(remove_trailing_newlines)
        self.explicit_comment.addParseAction(remove_braces)
        # Previous implementation included comment until next '}'.
        # This is however not in line with bibtex behavior that is to only
        # ignore until EOL. Brace stripping is arbitrary here but avoids
        # duplication on bibtex write.

        # Empty implicit_comments lead to infinite loop of zeroOrMore
        def mustNotBeEmpty(t):
            """Parse action rejecting a match whose first token is empty."""
            if not t[0]:
                raise pp.ParseException("Match must not be empty.")

        # Implicit comments: not anything else
        self.implicit_comment = pp.originalTextFor(
            pp.SkipTo(not_an_implicit_comment).setParseAction(mustNotBeEmpty),
            asString=True)('ImplicitComment')
        self.implicit_comment.addParseAction(remove_trailing_newlines)

        # String definition
        self.string_def = (pp.Suppress(string_def_start) + in_braces_or_pars(
            string_name +
            pp.Suppress('=') +
            string_expr('StringValue')
            ))('StringDefinition')

        # Preamble declaration
        self.preamble_decl = (pp.Suppress(preamble_start) +
                              in_braces_or_pars(value))('PreambleDeclaration')

        # Main bibtex expression: any number of the elements below, tried in
        # this order; the implicit comment comes last as the catch-all.
        self.main_expression = pp.ZeroOrMore(
                self.string_def |
                self.preamble_decl |
                self.explicit_comment |
                self.entry |
                self.implicit_comment)

    def add_log_function(self, log_fun):
        """Add notice to logger on entry, comment, preamble, string definitions.

        A logging parse action is registered on each of the entry,
        implicit_comment, explicit_comment, preamble_decl and string_def
        expressions.

        :param log_fun: logger function, called with the message to log
        """
        for e in [self.entry,
                  self.implicit_comment,
                  self.explicit_comment,
                  self.preamble_decl,
                  self.string_def]:
            add_logger_parse_action(e, log_fun)

    def set_string_name_parse_action(self, fun):
        """Set the parseAction for string name expression.

        .. Note::

            For some reason pyparsing duplicates the string_name
            expression so setting its parseAction a posteriori has no effect
            in the context of a string expression. This is why this function
            should be used instead.

        :param fun: callable taking pyparsing's (s, l, t) parse action
            arguments
        """
        self._string_name_parse_action_fun = fun

    def _string_name_parse_action(self, s, l, t):
        """Forward to the function set by set_string_name_parse_action.

        This bound method is what __init__ registers on the string name
        expression; the function is looked up at each call.
        """
        return self._string_name_parse_action_fun(s, l, t)

    def set_string_expression_parse_action(self, fun):
        """Set the parseAction for string_expression expression.

        .. Note::

            See set_string_name_parse_action.

        :param fun: callable taking pyparsing's (s, l, t) parse action
            arguments
        """
        self._string_expr_parse_action_fun = fun

    def _string_expr_parse_action(self, s, l, t):
        """Forward to the function set by set_string_expression_parse_action.

        This bound method is what __init__ registers on the string
        expression; the function is looked up at each call.
        """
        return self._string_expr_parse_action_fun(s, l, t)

    def parseFile(self, file_obj):
        """Parse ``file_obj`` with main_expression, requiring a full match.

        :param file_obj: passed as is to main_expression.parseFile
        :returns: what main_expression.parseFile returns
        """
        return self.main_expression.parseFile(file_obj, parseAll=True)