# -*- coding: UTF-8 -*-
# (c) Jérôme Laheurte 2015-2019
# See LICENSE.txt
import functools
import collections
import logging
import re
from ptk.lexer import ProgressiveLexer, EOF, token, LexerPosition
from ptk.grammar import Grammar, Production, GrammarError
# production is only imported so that client code doesn't have to import it from grammar
from ptk.grammar import production # pylint: disable=W0611
from ptk.utils import Singleton, callbackByName, memoize
class ParseError(Exception):
"""
Syntax error when parsing.
:ivar token: The unexpected token.
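
    Typical handling (a minimal sketch; ``parser.parse`` is a placeholder for
    however input is fed to your parser/lexer subclass, only the error API
    below is defined in this module):

    .. code-block:: python

        try:
            parser.parse(text)
        except ParseError as exc:
            print('Unexpected %s at %s; expected one of %s'
                  % (exc.token.type, exc.position, exc.expecting()))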
"""
def __init__(self, grammar, tok, state, tokens):
self.token = tok
super().__init__('Unexpected token "%s" (%s) in state "%s"' % (tok.value, tok.type, sorted(state)))
self._state = state
self._expecting = set()
for terminal in grammar.tokenTypes():
if grammar.__actions__.get((state, terminal), None) is not None:
self._expecting.add(terminal)
self._tokens = tokens
@property
def position(self):
return self.token.position
    def expecting(self):
"""
        Returns the set of token types that would have been valid in input.
"""
return self._expecting
    def state(self):
"""
Returns the parser state when the error was encountered.
"""
return self._state
    def lastToken(self):
"""
        Returns the last valid token seen before this error.
"""
return self._tokens[-1]
    def tokens(self):
"""
        Returns all tokens seen so far.
"""
return self._tokens
def leftAssoc(*operators):
"""
Class decorator for left associative operators. Use this to
decorate your :py:class:`Parser` class. Operators passed as
    arguments are assumed to have the same priority. The later you
declare associativity, the higher the priority; so the following
code
.. code-block:: python
@leftAssoc('+', '-')
@leftAssoc('*', '/')
class MyParser(LRParser):
# ...
declares '+' and '-' to be left associative, with the same
priority. '*' and '/' are also left associative, with a higher
priority than '+' and '-'.
See also the *priority* argument to :py:func:`production`.
"""
def _wrapper(cls):
cls.__precedence__.insert(0, ('left', set(operators)))
return cls
return _wrapper
def rightAssoc(*operators):
"""
Class decorator for right associative operators. Same remarks as :py:func:`leftAssoc`.
"""
def _wrapper(cls):
cls.__precedence__.insert(0, ('right', set(operators)))
return cls
return _wrapper
def nonAssoc(*operators):
"""
    Class decorator for non-associative operators. Same remarks as :py:func:`leftAssoc`.
"""
def _wrapper(cls):
cls.__precedence__.insert(0, ('non', set(operators)))
return cls
return _wrapper
class _StartState(metaclass=Singleton):
__reprval__ = '\u03A3'
class _ResolveError(Exception):
pass
@functools.total_ordering
class _Item(object):
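    """
    An LR(1) item: a production, the position of the dot in its right-hand
    side, and a lookahead terminal.
    """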
__slots__ = ('production', 'dot', 'terminal', 'index', 'shouldReduce', 'expecting')
def __init__(self, prod, dot, terminal):
self.production = prod
self.dot = dot
self.terminal = terminal
self.index = None
self.shouldReduce = self.dot == len(self.production.right)
self.expecting = None if self.shouldReduce else self.production.right[self.dot]
def next(self):
"""
Returns an item with the dot advanced one position
"""
return _Item(self.production, self.dot + 1, self.terminal)
def hasPrefix(self, *tokens):
"""
Return True if the passed sequence of token types appears right before the dot
"""
if len(tokens) > self.dot:
return False
for tok1, tok2 in zip(self.production.right[self.dot - len(tokens):self.dot], tokens):
if tok1 != tok2:
return False
return True
def __repr__(self):
symbols = list(self.production.right)
symbols.insert(self.dot, '\u2022')
return '%s -> %s (%s)' % (self.production.name, ' '.join([repr(sym) for sym in symbols]), self.terminal)
def __eq__(self, other):
return (self.production, self.dot, self.terminal) == (other.production, other.dot, other.terminal)
def __lt__(self, other):
return (self.production, self.dot, self.terminal) < (other.production, other.dot, other.terminal)
def __hash__(self):
return hash((self.production, self.dot, self.terminal))
class _Accept(BaseException):
def __init__(self, result):
self.result = result
super().__init__()
_StackItem = collections.namedtuple('_StackItem', ['state', 'value', 'position'])
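# Each entry of the parse stack: the current item set (state), the semantic
# value computed so far, and the position of the token that produced it.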
class _Shift(object):
def __init__(self, newState):
self.newState = newState
def doAction(self, grammar, stack, tok): # pylint: disable=W0613
stack.append(_StackItem(self.newState, tok.value, tok.position))
return True
class _Reduce(object):
def __init__(self, item):
self.item = item
self.nargs = len(item.production.right)
def doAction(self, grammar, stack, tok): # pylint: disable=W0613
pos, (callback, kwargs) = self._getCallback(stack)
self._applied(grammar, stack, callback(grammar, **kwargs), pos)
return False
def _applied(self, grammar, stack, prodVal, position):
stack.append(_StackItem(grammar.goto(stack[-1].state, self.item.production.name), prodVal, position))
def _getCallback(self, stack):
if self.nargs:
args = [stackItem.value for stackItem in stack[-self.nargs:]]
pos = stack[-self.nargs].position
stack[-self.nargs:] = []
else:
args = []
            pos = stack[-1].position  # Empty production: reuse the position of the current top of stack.
return pos, self.item.production.apply(args, pos)
class LRParser(Grammar):
"""
LR(1) parser. This class is intended to be used with a lexer class
derived from :py:class:`LexerBase`, using inheritance; it
overrides :py:func:`LexerBase.newToken` so you must inherit from
the parser first, then the lexer:
.. code-block:: python
class MyParser(LRParser, ReLexer):
# ...
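
    A slightly fuller sketch (hedged; the token and production definitions
    below are illustrative only):

    .. code-block:: python

        class Calc(LRParser, ReLexer):
            @token('[0-9]+')
            def number(self, tok):
                pass

            @production('E -> number<n>')
            def expr(self, n):
                return int(n)

            def newSentence(self, result):
                print(result)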
"""
def __init__(self): # pylint: disable=R0914,R0912
super().__init__()
self._restartParser()
def rstack(self):
return reversed(self.__stack)
def newToken(self, tok):
try:
for action, stack in self._processToken(tok):
if action.doAction(self, stack, tok):
break
self.__tokens.append(tok)
except _Accept as exc:
self._restartParser()
return self.newSentence(exc.result)
def currentLRState(self):
for item in self.__stack[-1].state:
return item.index
def _processToken(self, tok):
while True:
action = self.__actions__.get((self.__stack[-1].state, tok.type), None)
if action is None:
raise ParseError(self, tok, self.__stack[-1].state, self.__tokens)
yield action, self.__stack
    def newSentence(self, sentence): # pragma: no cover
"""
This is called when the start symbol has been reduced.
:param sentence: The value associated with the start symbol.
"""
raise NotImplementedError
@classmethod
def _createProductionParser(cls, name, priority, attrs):
return ProductionParser(callbackByName(name), priority, cls, attrs)
@classmethod
def _createReduceAction(cls, item):
return _Reduce(item)
@classmethod
def _createShiftAction(cls, state):
return _Shift(state)
@classmethod
def prepare(cls):
for prod in cls.productions():
if prod.name is _StartState:
break
else:
def acceptor(_, result):
raise _Accept(result)
prod = Production(_StartState, acceptor)
prod.addSymbol(cls._defaultStartSymbol() if cls.startSymbol is None else cls.startSymbol, name='result')
cls.__productions__.insert(0, prod)
cls.startSymbol = _StartState
super().prepare()
states, goto = cls.__computeStates(prod)
reachable = cls.__computeActions(states, goto)
logger = logging.getLogger('LRParser')
cls.__resolveConflicts(logger)
usedTokens = set([symbol for state, symbol in cls.__actions__.keys() if symbol is not EOF])
if usedTokens != cls.tokenTypes(): # pragma: no cover
logger.warning('The following tokens are not used: %s', ','.join([repr(sym) for sym in sorted(cls.tokenTypes() - usedTokens)]))
if reachable != cls.nonterminals(): # pragma: no cover
logger.warning('The following nonterminals are not reachable: %s', ','.join([repr(sym) for sym in sorted(cls.nonterminals() - reachable)]))
# Reductions only need goto entries for nonterminals
cls._goto = dict([((state, symbol), newState) for (state, symbol), newState in goto.items() if symbol not in cls.tokenTypes()])
parts = list()
if cls.nSR:
parts.append('%d shift/reduce conflicts' % cls.nSR)
if cls.nRR:
parts.append('%d reduce/reduce conflicts' % cls.nRR)
if parts:
logger.warning(', '.join(parts))
# Cast to tuple because sets are not totally ordered
for index, state in enumerate([tuple(cls._startState)] + sorted([tuple(state) for state in states if state != cls._startState])):
logger.debug('State %d', index)
for item in sorted(state):
logger.debug(' %s', item)
item.index = index
cls.__lrstates__.append(sorted(state))
logger.info('%d states.', len(states))
@classmethod
def __computeStates(cls, start):
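        # Canonical LR(1) state construction: starting from the kernel of the
        # initial state, compute goto(state, symbol) for every grammar symbol
        # until no new item sets appear. 'goto' maps (state, symbol) pairs to
        # the resulting kernel item sets.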
allSyms = list(cls.tokenTypes() | cls.nonterminals())
goto = list()
cls._startState = frozenset([_Item(start, 0, EOF)])
states = set([cls._startState])
stack = [cls._startState]
while stack:
state = stack.pop()
stateClosure = cls.__itemSetClosure(state)
for symbol in allSyms:
# Compute goto(symbol, state)
nextState = frozenset([item.next() for item in stateClosure if item.expecting == symbol])
if nextState:
goto.append(((state, symbol), nextState))
if nextState not in states:
states.add(nextState)
stack.append(nextState)
return states, dict(goto)
@classmethod
def __computeActions(cls, states, goto):
cls.__actions__ = dict()
reachable = set()
tokenTypes = cls.tokenTypes()
for state in states:
for item in cls.__itemSetClosure(state):
if item.shouldReduce:
action = cls._createReduceAction(item)
reachable.add(item.production.name)
cls.__addReduceAction(state, item.terminal, action)
else:
symbol = item.production.right[item.dot]
if symbol in tokenTypes:
cls.__addShiftAction(state, symbol, cls._createShiftAction(goto[(state, symbol)]))
return reachable
@classmethod
def __shouldPreferShift(cls, logger, reduceAction, symbol):
logger.info('Shift/reduce conflict for "%s" on "%s"', reduceAction.item, symbol)
prodPrecedence = reduceAction.item.production.precedence(cls)
tokenPrecedence = cls.terminalPrecedence(symbol)
# If both precedences are specified, use priority/associativity
if prodPrecedence is not None and tokenPrecedence is not None:
prodAssoc, prodPrio = prodPrecedence
tokenAssoc, tokenPrio = tokenPrecedence
if prodPrio > tokenPrio:
logger.info('Resolving in favor of reduction because of priority')
return False
if prodPrio < tokenPrio:
logger.info('Resolving in favor of shift because of priority')
return True
if prodAssoc == tokenAssoc:
if prodAssoc == 'non':
logger.info('Resolving in favor of error because of associativity')
raise _ResolveError()
if prodAssoc == 'left':
logger.info('Resolving in favor of reduction because of associativity')
return False
logger.info('Resolving in favor of shift because of associativity')
return True
# At least one of those is not specified; use shift
        logger.warning('Could not disambiguate shift/reduce conflict for "%s" on "%s"; using shift', reduceAction.item, symbol)
cls.nSR += 1
return True
@classmethod
def __resolveConflicts(cls, logger):
cls.nSR = 0
cls.nRR = 0
for (state, symbol), actions in sorted(cls.__actions__.items()):
action = actions.pop()
while actions:
conflicting = actions.pop()
try:
action = cls.__resolveConflict(logger, action, conflicting, symbol)
except _ResolveError:
del cls.__actions__[(state, symbol)]
break
else:
cls.__actions__[(state, symbol)] = action
@classmethod
def __resolveConflict(cls, logger, action1, action2, symbol):
if isinstance(action2, _Shift):
action1, action2 = action2, action1
if isinstance(action1, _Shift):
# Shift/reduce
return action1 if cls.__shouldPreferShift(logger, action2, symbol) else action2
# Reduce/reduce
logger.warning('Reduce/reduce conflict between "%s" and "%s"', action1.item, action2.item)
cls.nRR += 1
# Use the first one to be declared
for prod in cls.productions():
if prod == action1.item.production:
                logger.warning('Using "%s"', prod)
return action1
if prod == action2.item.production:
                logger.warning('Using "%s"', prod)
return action2
@classmethod
def __addReduceAction(cls, state, symbol, action):
cls.__actions__.setdefault((state, symbol), list()).append(action)
@classmethod
def __addShiftAction(cls, state, symbol, action):
for existing in cls.__actions__.get((state, symbol), list()):
if isinstance(existing, _Shift):
return
cls.__actions__.setdefault((state, symbol), list()).append(action)
@classmethod
def goto(cls, state, symbol):
return cls._goto[(state, symbol)]
def _restartParser(self):
self.__stack = [_StackItem(self._startState, None, LexerPosition(1, 1))]
self.__tokens = []
self.restartLexer()
@classmethod
@memoize
def __itemSetClosure(cls, items):
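        # Standard LR(1) closure: for each item A -> alpha . B beta with
        # lookahead t, add B -> . gamma for every production of B and every
        # terminal in FIRST(beta t), until a fixed point is reached.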
result = set(items)
while True:
prev = set(result)
for item in [item for item in result if not item.shouldReduce]:
symbol = item.production.right[item.dot]
if symbol not in cls.tokenTypes():
terminals = cls.first(*tuple(item.production.right[item.dot + 1:] + [item.terminal]))
for prod in (prod for prod in cls.productions() if prod.name == symbol):
for terminal in terminals:
result.add(_Item(prod, 0, terminal))
if prev == result:
break
return result
class ProductionParser(LRParser, ProgressiveLexer): # pylint: disable=R0904
# pylint: disable=C0111,C0103,R0201
def __init__(self, callback, priority, grammarClass, attributes): # pylint: disable=R0915
self.callback = callback
self.priority = priority
self.grammarClass = grammarClass
self.attributes = attributes
super().__init__()
@classmethod
def prepare(cls, **kwargs): # pylint: disable=R0915
# Obviously cannot use @production here
# When mixing async and sync parsers in the same program this may be called twice,
# because AsyncProductionParser inherits from ProductionParser
if cls.productions():
return
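        # Informal summary of the grammar built below, i.e. the syntax
        # accepted for production strings passed to @production:
        #   DECL       -> LEFT arrow PRODS
        #   LEFT       -> identifier | identifier lchev identifier rchev
        #   PRODS      -> P | PRODS union P
        #   P          -> P SYM | ɛ
        #   SYM        -> SYMNAME PROPERTIES
        #              |  SYMNAME repeat PROPERTIES
        #              |  SYMNAME repeat lparen (identifier | litteral) rparen PROPERTIES
        #   SYMNAME    -> identifier | litteral
        #   PROPERTIES -> ɛ | lchev identifier rchev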
# DECL -> identifier "->" PRODS
prod = Production('DECL', cls.DECL)
prod.addSymbol('LEFT', 'left')
prod.addSymbol('arrow')
prod.addSymbol('PRODS', 'prods')
cls.__productions__.append(prod)
# LEFT -> identifier
prod = Production('LEFT', cls.LEFT)
prod.addSymbol('identifier', 'name')
cls.__productions__.append(prod)
# LEFT -> identifier "<" posarg ">"
prod = Production('LEFT', cls.LEFT)
prod.addSymbol('identifier', 'name')
prod.addSymbol('lchev')
prod.addSymbol('identifier', 'posarg')
prod.addSymbol('rchev')
cls.__productions__.append(prod)
# PRODS -> P
prod = Production('PRODS', cls.PRODS1)
prod.addSymbol('P', 'prodlist')
cls.__productions__.append(prod)
# PRODS -> PRODS "|" P
prod = Production('PRODS', cls.PRODS2)
prod.addSymbol('PRODS', 'prods')
prod.addSymbol('union')
prod.addSymbol('P', 'prodlist')
cls.__productions__.append(prod)
# P -> P SYM
prod = Production('P', cls.P1)
prod.addSymbol('P', 'prodlist')
prod.addSymbol('SYM', 'sym')
cls.__productions__.append(prod)
# P -> ɛ
prod = Production('P', cls.P2)
cls.__productions__.append(prod)
# SYM -> SYMNAME PROPERTIES
prod = Production('SYM', cls.SYM)
prod.addSymbol('SYMNAME', 'symname')
prod.addSymbol('PROPERTIES', 'properties')
cls.__productions__.append(prod)
# SYM -> SYMNAME repeat PROPERTIES
prod = Production('SYM', cls.SYMREP)
prod.addSymbol('SYMNAME', 'symname')
prod.addSymbol('repeat', 'repeat')
prod.addSymbol('PROPERTIES', 'properties')
cls.__productions__.append(prod)
# SYM -> SYMNAME repeat lparen identifier rparen PROPERTIES
prod = Production('SYM', cls.SYMREP)
prod.addSymbol('SYMNAME', 'symname')
prod.addSymbol('repeat', 'repeat')
prod.addSymbol('lparen')
prod.addSymbol('identifier', 'separator')
prod.addSymbol('rparen')
prod.addSymbol('PROPERTIES', 'properties')
cls.__productions__.append(prod)
# SYM -> SYMNAME repeat lparen litteral rparen PROPERTIES
prod = Production('SYM', cls.SYMREP_LIT)
prod.addSymbol('SYMNAME', 'symname')
prod.addSymbol('repeat', 'repeat')
prod.addSymbol('lparen')
prod.addSymbol('litteral', 'separator')
prod.addSymbol('rparen')
prod.addSymbol('PROPERTIES', 'properties')
cls.__productions__.append(prod)
# SYMNAME -> identifier
prod = Production('SYMNAME', cls.SYMNAME1)
prod.addSymbol('identifier', 'identifier')
cls.__productions__.append(prod)
# SYMNAME -> litteral
prod = Production('SYMNAME', cls.SYMNAME2)
prod.addSymbol('litteral', 'litteral')
cls.__productions__.append(prod)
# PROPERTIES -> ɛ
prod = Production('PROPERTIES', cls.PROPERTIES1)
cls.__productions__.append(prod)
# PROPERTIES -> lchev identifier rchev
prod = Production('PROPERTIES', cls.PROPERTIES2)
prod.addSymbol('lchev')
prod.addSymbol('identifier', 'name')
prod.addSymbol('rchev')
cls.__productions__.append(prod)
super().prepare(**kwargs)
    def newSentence(self, startSymbol):
(name, posarg), prods = startSymbol
for prod in prods:
if prod.name is None:
prod.name = name
prod.posarg = posarg
self.grammarClass.__productions__.extend(prods)
# Lexer
    @staticmethod
def ignore(char):
return char in ' \t\n'
@token('->')
def arrow(self, tok):
pass
@token('<')
def lchev(self, tok):
pass
@token('>')
def rchev(self, tok):
pass
@token(r'\|')
def union(self, tok):
pass
@token(r'\*|\+|\?')
def repeat(self, tok):
pass
@token(r'\(')
def lparen(self, tok):
pass
@token(r'\)')
def rparen(self, tok):
pass
@token('[a-zA-Z_][a-zA-Z0-9_]*')
def identifier(self, tok):
pass
@token(r'"|\'')
def litteral(self, tok):
class StringBuilder(object):
def __init__(self, quotetype):
self.quotetype = quotetype
self.chars = list()
self.state = 0
def feed(self, char):
if self.state == 0:
if char == '\\':
self.state = 1
elif char == self.quotetype:
return 'litteral', ''.join(self.chars)
else:
self.chars.append(char)
elif self.state == 1:
self.chars.append(char)
self.state = 0
self.setConsumer(StringBuilder(tok.value))
# Parser
def DECL(self, left, prods):
name, posarg = left
if name in self.grammarClass.tokenTypes():
raise GrammarError('"%s" is a token name and cannot be used as non-terminal' % name)
return (left, prods)
def LEFT(self, name, posarg=None):
return (name, posarg)
def PRODS1(self, prodlist):
return prodlist
def PRODS2(self, prods, prodlist):
prods.extend(prodlist)
return prods
def P1(self, sym, prodlist):
result = list()
symbol, properties, repeat, sep = sym
for prod in prodlist:
if prod.name is None:
if repeat is None:
prod.addSymbol(symbol, name=properties.get('name', None))
result.append(prod)
elif repeat == '?':
if sep is not None:
raise GrammarError('A separator makes no sense for "?"')
self.__addAtMostOne(result, prod, symbol, properties.get('name', None))
elif repeat in ['*', '+']:
self.__addList(result, prod, symbol, properties.get('name', None), repeat == '*', sep)
else:
result.append(prod)
return result
def __addAtMostOne(self, productions, prod, symbol, name):
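        # Expand 'SYM?': emit a clone of the production without the optional
        # symbol (its callback then receives None for that name), plus the
        # original production with the symbol appended.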
clone = prod.cloned()
if name is not None:
self._wrapCallbackNone(name, clone)
productions.append(clone)
prod.addSymbol(symbol, name=name)
productions.append(prod)
def _wrapCallbackNone(self, name, prod):
previous = prod.callback
def callback(*args, **kwargs):
kwargs[name] = None
return previous(*args, **kwargs)
prod.callback = callback
def __addList(self, productions, prod, symbol, name, allowEmpty, sep):
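        # Expand 'SYM*' / 'SYM+' through an auxiliary list non-terminal:
        #   List -> SYM        List -> List [separator] SYM
        # The host production gets the List symbol appended; for '*' a clone
        # without it is also emitted so that the empty case yields [].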
class ListSymbol(metaclass=Singleton):
__reprval__ = 'List(%s, "%s")' % (symbol, '*' if allowEmpty else '+')
if allowEmpty:
clone = prod.cloned()
self._wrapCallbackEmpty(name, clone)
productions.append(clone)
prod.addSymbol(ListSymbol, name=name)
productions.append(prod)
listProd = Production(ListSymbol, self._wrapCallbackOne())
listProd.addSymbol(symbol, name='item')
productions.append(listProd)
listProd = Production(ListSymbol, self._wrapCallbackNext())
listProd.addSymbol(ListSymbol, name='items')
if sep is not None:
listProd.addSymbol(sep)
listProd.addSymbol(symbol, name='item')
productions.append(listProd)
def _wrapCallbackEmpty(self, name, prod):
previous = prod.callback
def cbEmpty(*args, **kwargs):
if name is not None:
kwargs[name] = []
return previous(*args, **kwargs)
prod.callback = cbEmpty
def _wrapCallbackOne(self):
def cbOne(_, item):
return [item]
return cbOne
def _wrapCallbackNext(self):
def cbNext(_, items, item):
items.append(item)
return items
return cbNext
def P2(self):
# 'name' is replaced in newSentence()
return [Production(None, self.callback, priority=self.priority, attributes=self.attributes)]
def SYMNAME1(self, identifier):
return identifier
def SYMNAME2(self, litteral):
name = litteral
if name not in self.grammarClass.tokenTypes():
self.grammarClass.addTokenType(name, lambda s, tok: None, re.escape(name), None)
return name
def SYM(self, symname, properties):
return (symname, properties, None, None)
def SYMREP(self, symname, repeat, properties, separator=None):
return (symname, properties, repeat, separator)
def SYMREP_LIT(self, symname, repeat, properties, separator):
if separator not in self.grammarClass.tokenTypes():
self.grammarClass.addTokenType(separator, lambda s, tok: None, re.escape(separator), None)
return self.SYMREP(symname, repeat, properties, separator)
def PROPERTIES1(self):
return dict()
def PROPERTIES2(self, name):
return dict(name=name)