"""
lexington.regex
===============
Lexington's lexers operate using the derivative of a regular expression.
Python's regular expressions as implemented in the `re` module are not
actually regular expressions, and probably wouldn't give you access to their
derivatives even if they were. So, this implementation is necessary.
:copyright: (C) 2013, Matthew Frazier
:license: Released under the MIT/X11 license, see LICENSE for details
"""
from __future__ import unicode_literals
from abc import ABCMeta, abstractmethod, abstractproperty
from collections import Sequence
from .strings import Strings, Characters, native_strings, n, string_type
### Very scary metaprogramming ###
class _RegexClass(ABCMeta):
"""
This is a metaclass, and a giant hack. It has two primary functions:
First, it obviates the needs to decorate every `__repr__` method with
`~lexington.strings.native_strings`.
Second, because it doesn't make sense to instantiate `Regex` directly,
it's convenient to use it as a factory for actual `Regex` subclasses.
However, the behavior of `__new__` can be a bit confusing.
By overriding `__call__` in the metaclass directly, we can prevent the
whole "`__new__`/`__init__`" stack from entering the picture when using
`Regex` as a factory function.
"""
def __new__(mcls, name, bases, namespace):
# __new__ is called on the metaclass when creating a class.
# It has the chance to modify the class's namespace before the
# class is actually created.
# We use this as a chance to wrap the __repr__ method.
if '__repr__' in namespace:
namespace['__repr__'] = native_strings(namespace['__repr__'])
return super(_RegexClass, mcls).__new__(mcls, name, bases, namespace)
def __call__(cls, *args, **kwargs):
# A class is just an object, and a metaclass is the type of that
# object. So, when you do AClass(...), it calls the __call__ method
# on that class, just like any other object.
if cls is Regex:
return regexify(*args, **kwargs)
else:
return super(_RegexClass, cls).__call__(*args, **kwargs)
_Regex = _RegexClass(n("_Regex"), (object,), dict(
__doc__ = "This is needed for Python 3 compatibility. "
"The metaclass syntax changed between Python 3 and Python 2, "
"so we need to construct a base class programmatically.",
__slots__ = ()
))
### Actual regular expression classes ###
[docs]class Regex(_Regex):
"""
This is the abstract base class for elements of regular expressions.
(It's also used as a factory for converting mundane Python data types
like strings into regular expressions.)
`Regex` objects are immutable, hashable, and comparable.
In practice, `Regex`'s subclasses should be regarded as implementation
details. You shouldn't attempt to create instances of them, create new
subclasses of `Regex`, or test that a regex is an instance of a
particular `Regex` subclass.
:param e: An expression to convert into a regular expression.
(This is equivalent to `regexify`.)
"""
__metaclass__ = _RegexClass
__slots__ = ()
### Abstractions to override
@abstractmethod
[docs] def derive(self, sym):
"""
Returns the derivative of this regular expression with respect to
a symbol. You can view it as::
{s[1:] for s in languages_generated_by(self) if s and s[0] == sym}
:param sym: The symbol to derive this regular expression with regards
to.
"""
pass
@abstractproperty
[docs] def accepts_empty_string(self):
"""
Indicates whether this regular expression will accept the empty
string.
"""
pass
@abstractproperty
[docs] def alphabet(self):
"""
Indicates the alphabet of the strings this regular expression can
match. `None` indicates that this regular expression is
independent of alphabet.
The alphabet will usually be `~lexington.strings.Text` or
`~lexington.strings.Bytestring`.
"""
pass
def __eq__(self, other):
return type(self) is type(other) and hash(self) == hash(other)
@abstractmethod
def __hash__(self):
pass
### High-level regex operations
[docs] def match(self, subject):
"""
Determines whether the `subject` matches this regex. This performs a
total match -- if you want the behavior of `re.match`, which only
matches a prefix, use `match_prefix`.
This returns `True` if the match succeeds, and `False` if not.
:param subject: The string to match against this regex.
"""
re = self
for sym in subject:
re = re.derive(sym)
if re is Null:
return False
return re.accepts_empty_string
@property
[docs] def literal(self):
"""
If this regular expression matches a literal string exactly, this
property contains that string. Otherwise, it will be `None`.
(You can generally only assume that regexes constructed *from* literal
strings will have this property.)
"""
return None
### Operator overloads and convenience methods
[docs] def star(self):
"""
Creates a regular expression that accepts this one repeated any
number of times. (Equivalent to the `~lexington.regex.star` function.)
"""
return star(self)
[docs] def plus(self):
"""
Creates a regular expression that accepts this one repeated any
number of times.
"""
return concat(self, star(self))
[docs] def maybe(self):
"""
Creates a regular expression that accepts this one, or the empty
string.
"""
return union(self, Epsilon)
def __add__(self, suffix):
return concat(self, suffix)
def __radd__(self, prefix):
return concat(prefix, self)
def __or__(self, other):
return union(self, other)
def __ror__(self, other):
return union(other, self)
def __pow__(self, count):
return repeat(self, count)
class EpsilonRegex(Regex):
"""
A regular expression that matches the empty string.
"""
__slots__ = ()
def derive(self, sym):
return Null
accepts_empty_string = True
alphabet = None
def __repr__(self):
return "Epsilon"
def __hash__(self):
return hash(type(self))
class NullRegex(Regex):
"""
A regular expression that doesn't match any strings, even the empty
string.
"""
__slots__ = ()
def derive(self, sym):
return self
accepts_empty_string = False
alphabet = None
def __repr__(self):
return "Null"
def __hash__(self):
return hash(type(self))
class SymbolRegex(Regex):
"""
A regular expression that matches a particular symbol.
:param sym: The symbol to match.
"""
__slots__ = ('sym')
def __init__(self, sym):
self.sym = sym
def derive(self, sym):
return Epsilon if sym == self.sym else Null
accepts_empty_string = False
@property
def alphabet(self):
return string_type(self.sym)
@property
def literal(self):
return self.sym
def __repr__(self):
return "Regex(%r)" % self.sym
def __hash__(self):
return hash((id(type(self)), self.sym))
class AnySymbolRegex(Regex):
"""
A regular expression that matches ANY symbol (but not the lack of one).
Equivalent to ``.`` in Python's regex notation.
:param sym: The symbol to match.
"""
__slots__ = ()
def derive(self, sym):
return Epsilon
accepts_empty_string = False
alphabet = None
def __repr__(self):
return "Any"
def __hash__(self):
return hash(type(self))
class UnionRegex(Regex):
"""
A regular expression that will match any of multiple options.
:param options: The regular expressions to accept.
"""
__slots__ = ('options', 'alphabet')
def __init__(self, options):
self.alphabet = None
self.options = frozenset(options)
for opt in self.options:
if opt.alphabet is not None:
if self.alphabet is None:
self.alphabet = opt.alphabet
elif opt.alphabet is not self.alphabet:
raise TypeError(n("Cannot mix alphabets %r and %r in "
"union" %
(self.alphabet, opt.alphabet)))
def derive(self, sym):
return union(*(r.derive(sym) for r in self.options))
@property
def accepts_empty_string(self):
return any(r.accepts_empty_string for r in self.options)
def __repr__(self):
return " | ".join(repr(r) for r in self.options)
def __hash__(self):
return hash((id(type(self)), self.options))
class ConcatRegex(Regex):
"""
A regular expression that matches two regular expressions in a row.
"""
__slots__ = ('prefix', 'suffix', 'alphabet')
def __init__(self, prefix, suffix):
self.prefix = prefix
self.suffix = suffix
# This logic is admittedly a bit twisty. The idea is:
# If the prefix and suffix are alphabet-independent, so is this.
# If the prefix and suffix have the same alphabet, or one has an
# alphabet and the other doesn't, this will have the same alphabet.
# If the prefix and suffix have different alphabets, that's an error.
alpha_pre = prefix.alphabet
alpha_suf = suffix.alphabet
if alpha_pre is not None or alpha_suf is not None:
if alpha_pre is not alpha_suf:
raise TypeError(n("Cannot concatenate alphabets %r and %r" %
(alpha_pre, alpha_suf)))
self.alphabet = alpha_pre if alpha_suf is None else alpha_suf
else:
self.alphabet = None
def derive(self, sym):
if self.prefix.accepts_empty_string:
return union(concat(self.prefix.derive(sym), self.suffix),
self.suffix.derive(sym))
else:
return concat(self.prefix.derive(sym), self.suffix)
@property
def accepts_empty_string(self):
return (self.prefix.accepts_empty_string and
self.suffix.accepts_empty_string)
@property
def literal(self):
if self.prefix.literal and self.suffix.literal:
return self.prefix.literal + self.suffix.literal
else:
return None
def __repr__(self):
if self.literal:
return "Regex(%r)" % self.literal
else:
return "%r + %r" % (self.prefix, self.suffix)
def __hash__(self):
return hash((id(type(self)), self.prefix, self.suffix))
class StarRegex(Regex):
"""
A regular expression that will match a certain regex, repeated any number
of times.
:param regex: The regular expression describing the strings to repeat.
"""
__slots__ = ('regex')
def __init__(self, regex):
self.regex = regex
def derive(self, sym):
return concat(self.regex.derive(sym), self)
accepts_empty_string = True
@property
def alphabet(self):
return self.regex.alphabet
def __repr__(self):
return "star(%r)" % self.regex
def __hash__(self):
return hash((id(type(self)), self.regex))
class RepeatRegex(Regex):
"""
A regular expression that will match a certain regex, repeated a specific
number of times.
:param regex: The regular expression describing the strings to repeat.
:param count: The number of times to repeat it.
"""
__slots__ = ('regex', 'count')
def __init__(self, regex, count):
if count < 2:
raise ValueError("Repeat must be greater than 1" % count)
self.regex = regex
self.count = count
def derive(self, sym):
return concat(self.regex.derive(sym),
repeat(self.regex, self.count - 1))
accepts_empty_string = False
@property
def alphabet(self):
return self.regex.alphabet
def __repr__(self):
return "%r ** %d" % (self.regex, self.count)
def __hash__(self):
return hash((id(type(self)), self.regex, self.count))
### Regex constructors ###
[docs]def regexify(e):
"""
Converts a Python object to a `Regex`. If it's already a `Regex`, it just
returns it. It will also accept any string or character type, and create
a regex that matches that exactly.
:param e: The Python object to create a regex of.
"""
if isinstance(e, Regex):
return e
elif isinstance(e, Strings):
if len(e) == 0:
return Epsilon
return join(SymbolRegex(sym) for sym in e)
elif isinstance(e, Characters):
return SymbolRegex(e)
else:
raise TypeError(n("Instances of %r can't be automatically converted "
"to regular expressions" % type(e)))
#: A regular expression that matches any single symbol.
Any = AnySymbolRegex()
#: A regular expression that matches the empty string.
Epsilon = EpsilonRegex()
#: A regular expression that refuses to accept *anything* -- even the
#: empty string.
Null = NullRegex()
[docs]def union(*options):
"""
Creates a regular expression that accepts *any* of the following regexes.
:param options: The regular expressions to accept.
"""
# A list comprehension would be cleaner, but we need to be able to check
# the value *after* processing to leave out Nulls.
s = set()
for regex in options:
if isinstance(regex, UnionRegex):
s.update(regex.options)
else:
regex = regexify(regex)
if regex is not Null:
s.add(regex)
if not s:
return Null
elif len(s) == 1:
return s.pop()
else:
return UnionRegex(s)
[docs]def concat(prefix, suffix):
"""
Concatenates two regular expressions, such that `prefix` will be matched,
then `suffix` will be matched after it is complete.
:param prefix: The first regular expression.
:param suffix: The second regular expression.
"""
prefix = regexify(prefix)
suffix = regexify(suffix)
if prefix is Null or suffix is Null:
return Null
elif prefix is Epsilon:
return suffix
elif suffix is Epsilon:
return prefix
else:
return ConcatRegex(prefix, suffix)
[docs]def join(regexes):
"""
Concatenates a sequence of regular expressions, such that each one
will be matched successively. (Note that a `Null` value anywhere in the
sequence will result in a `Null` overall.) A zero-length sequence
will return `Epsilon`.
:param regexes: The regular expressions to concatenate.
"""
if not isinstance(regexes, Sequence):
regexes = tuple(regexes)
n = len(regexes)
if n == 2:
return concat(regexes[0], regexes[1])
elif n == 1:
return regexify(regexes[0])
elif n == 0:
return Epsilon
else:
# This is a right fold (`reduce` backwards).
# [-2::-1] is slice for "start at the next-to-last and go in reverse."
final = regexes[-1]
for regex in regexes[-2::-1]:
final = concat(regex, final)
# Break early on Null.
if final is Null:
return Null
return final
[docs]def star(regex):
"""
Creates a regular expression that accepts `regex`, repeated any number
of times -- even 0.
:param regex: The regular expression describing the strings to repeat.
"""
if regex is Epsilon:
return Epsilon
elif regex is Null:
return Epsilon
elif isinstance(regex, StarRegex):
# r* == r**, so we can avoid wrapping it again and wasting time.
return regex
else:
return StarRegex(regexify(regex))
[docs]def repeat(regex, count):
"""
Creates a regular expression that accepts `regex` repeated a specific
number of times.
:param regex: The regular expression describing the strings to reepeat.
:param count: The number of times to repeat it.
"""
if count == 0:
return Epsilon
elif count == 1:
return regex
elif regex is Epsilon:
return Epsilon
elif regex is Null:
return Null
else:
return RepeatRegex(regexify(regex), count)