Source code for lexington.regex

"""
lexington.regex
===============
Lexington's lexers operate using the derivative of a regular expression.
Python's regular expressions as implemented in the `re` module are not
actually regular expressions, and probably wouldn't give you access to their
derivatives even if they were. So, this implementation is necessary.

:copyright: (C) 2013, Matthew Frazier
:license:   Released under the MIT/X11 license, see LICENSE for details
"""
from __future__ import unicode_literals
from abc import ABCMeta, abstractmethod, abstractproperty
from collections import Sequence
from .strings import Strings, Characters, native_strings, n, string_type


### Very scary metaprogramming ###

class _RegexClass(ABCMeta):
    """
    This is a metaclass, and a giant hack. It has two primary functions:

    First, it obviates the needs to decorate every `__repr__` method with
    `~lexington.strings.native_strings`.

    Second, because it doesn't make sense to instantiate `Regex` directly,
    it's convenient to use it as a factory for actual `Regex` subclasses.
    However, the behavior of `__new__` can be a bit confusing.
    By overriding `__call__` in the metaclass directly, we can prevent the
    whole "`__new__`/`__init__`" stack from entering the picture when using
    `Regex` as a factory function.
    """
    def __new__(mcls, name, bases, namespace):
        # __new__ is called on the metaclass when creating a class.
        # It has the chance to modify the class's namespace before the
        # class is actually created.
        # We use this as a chance to wrap the __repr__ method.
        if '__repr__' in namespace:
            namespace['__repr__'] = native_strings(namespace['__repr__'])
        return super(_RegexClass, mcls).__new__(mcls, name, bases, namespace)

    def __call__(cls, *args, **kwargs):
        # A class is just an object, and a metaclass is the type of that
        # object. So, when you do AClass(...), it calls the __call__ method
        # on that class, just like any other object.
        if cls is Regex:
            return regexify(*args, **kwargs)
        else:
            return super(_RegexClass, cls).__call__(*args, **kwargs)


_Regex = _RegexClass(n("_Regex"), (object,), dict(
    __doc__ = "This is needed for Python 3 compatibility. "
              "The metaclass syntax changed between Python 3 and Python 2, "
              "so we need to construct a base class programmatically.",
    __slots__ = ()
))


### Actual regular expression classes ###


[docs]class Regex(_Regex):
    """
    This is the abstract base class for elements of regular expressions.
    (It's also used as a factory for converting mundane Python data types
    like strings into regular expressions.)

    `Regex` objects are immutable, hashable, and comparable.

    In practice, `Regex`'s subclasses should be regarded as implementation
    details. You shouldn't attempt to create instances of them, create new
    subclasses of `Regex`, or test that a regex is an instance of a
    particular `Regex` subclass.

    :param e: An expression to convert into a regular expression.
              (This is equivalent to `regexify`.)
    """
    __metaclass__ = _RegexClass
    __slots__ = ()

    ### Abstractions to override

    @abstractmethod
[docs]    def derive(self, sym):
        """
        Returns the derivative of this regular expression with respect to
        a symbol. You can view it as::

            {s[1:] for s in languages_generated_by(self) if s and s[0] == sym}

        :param sym: The symbol to derive this regular expression with regards
                    to.
        """
        pass

    @abstractproperty
[docs]    def accepts_empty_string(self):
        """
        Indicates whether this regular expression will accept the empty
        string.
        """
        pass

    @abstractproperty
[docs]    def alphabet(self):
        """
        Indicates the alphabet of the strings this regular expression can
        match. `None` indicates that this regular expression is
        independent of alphabet.

        The alphabet will usually be `~lexington.strings.Text` or
        `~lexington.strings.Bytestring`.
        """
        pass

    def __eq__(self, other):
        return type(self) is type(other) and hash(self) == hash(other)

    @abstractmethod
    def __hash__(self):
        pass

    ### High-level regex operations

[docs]    def match(self, subject):
        """
        Determines whether the `subject` matches this regex. This performs a
        total match -- if you want the behavior of `re.match`, which only
        matches a prefix, use `match_prefix`.

        This returns `True` if the match succeeds, and `False` if not.

        :param subject: The string to match against this regex.
        """
        re = self
        for sym in subject:
            re = re.derive(sym)
            if re is Null:
                return False
        return re.accepts_empty_string

    @property
[docs]    def literal(self):
        """
        If this regular expression matches a literal string exactly, this
        property contains that string. Otherwise, it will be `None`.
        (You can generally only assume that regexes constructed *from* literal
        strings will have this property.)
        """
        return None

    ### Operator overloads and convenience methods

[docs]    def star(self):
        """
        Creates a regular expression that accepts this one repeated any
        number of times. (Equivalent to the `~lexington.regex.star` function.)
        """
        return star(self)

[docs]    def plus(self):
        """
        Creates a regular expression that accepts this one repeated any
        number of times.
        """
        return concat(self, star(self))

[docs]    def maybe(self):
        """
        Creates a regular expression that accepts this one, or the empty
        string.
        """
        return union(self, Epsilon)

    def __add__(self, suffix):
        return concat(self, suffix)

    def __radd__(self, prefix):
        return concat(prefix, self)

    def __or__(self, other):
        return union(self, other)

    def __ror__(self, other):
        return union(other, self)

    def __pow__(self, count):
        return repeat(self, count)


class EpsilonRegex(Regex):
    """
    A regular expression that matches the empty string.
    """
    __slots__ = ()

    def derive(self, sym):
        return Null

    accepts_empty_string = True

    alphabet = None

    def __repr__(self):
        return "Epsilon"

    def __hash__(self):
        return hash(type(self))


class NullRegex(Regex):
    """
    A regular expression that doesn't match any strings, even the empty
    string.
    """
    __slots__ = ()

    def derive(self, sym):
        return self

    accepts_empty_string = False

    alphabet = None

    def __repr__(self):
        return "Null"

    def __hash__(self):
        return hash(type(self))


class SymbolRegex(Regex):
    """
    A regular expression that matches a particular symbol.

    :param sym: The symbol to match.
    """
    __slots__ = ('sym')

    def __init__(self, sym):
        self.sym = sym

    def derive(self, sym):
        return Epsilon if sym == self.sym else Null

    accepts_empty_string = False

    @property
    def alphabet(self):
        return string_type(self.sym)

    @property
    def literal(self):
        return self.sym

    def __repr__(self):
        return "Regex(%r)" % self.sym

    def __hash__(self):
        return hash((id(type(self)), self.sym))


class AnySymbolRegex(Regex):
    """
    A regular expression that matches ANY symbol (but not the lack of one).
    Equivalent to ``.`` in Python's regex notation.

    :param sym: The symbol to match.
    """
    __slots__ = ()

    def derive(self, sym):
        return Epsilon

    accepts_empty_string = False

    alphabet = None

    def __repr__(self):
        return "Any"

    def __hash__(self):
        return hash(type(self))


class UnionRegex(Regex):
    """
    A regular expression that will match any of multiple options.

    :param options: The regular expressions to accept.
    """
    __slots__ = ('options', 'alphabet')

    def __init__(self, options):
        self.alphabet = None
        self.options = frozenset(options)
        for opt in self.options:
            if opt.alphabet is not None:
                if self.alphabet is None:
                    self.alphabet = opt.alphabet
                elif opt.alphabet is not self.alphabet:
                    raise TypeError(n("Cannot mix alphabets %r and %r in "
                                      "union" %
                                      (self.alphabet, opt.alphabet)))

    def derive(self, sym):
        return union(*(r.derive(sym) for r in self.options))

    @property
    def accepts_empty_string(self):
        return any(r.accepts_empty_string for r in self.options)

    def __repr__(self):
        return " | ".join(repr(r) for r in self.options)

    def __hash__(self):
        return hash((id(type(self)), self.options))


class ConcatRegex(Regex):
    """
    A regular expression that matches two regular expressions in a row.
    """
    __slots__ = ('prefix', 'suffix', 'alphabet')

    def __init__(self, prefix, suffix):
        self.prefix = prefix
        self.suffix = suffix

        # This logic is admittedly a bit twisty. The idea is:
        # If the prefix and suffix are alphabet-independent, so is this.
        # If the prefix and suffix have the same alphabet, or one has an
        # alphabet and the other doesn't, this will have the same alphabet.
        # If the prefix and suffix have different alphabets, that's an error.
        alpha_pre = prefix.alphabet
        alpha_suf = suffix.alphabet
        if alpha_pre is not None or alpha_suf is not None:
            if alpha_pre is not alpha_suf:
                raise TypeError(n("Cannot concatenate alphabets %r and %r" %
                                  (alpha_pre, alpha_suf)))
            self.alphabet = alpha_pre if alpha_suf is None else alpha_suf
        else:
            self.alphabet = None

    def derive(self, sym):
        if self.prefix.accepts_empty_string:
            return union(concat(self.prefix.derive(sym), self.suffix),
                         self.suffix.derive(sym))
        else:
            return concat(self.prefix.derive(sym), self.suffix)

    @property
    def accepts_empty_string(self):
        return (self.prefix.accepts_empty_string and
                self.suffix.accepts_empty_string)

    @property
    def literal(self):
        if self.prefix.literal and self.suffix.literal:
            return self.prefix.literal + self.suffix.literal
        else:
            return None

    def __repr__(self):
        if self.literal:
            return "Regex(%r)" % self.literal
        else:
            return "%r + %r" % (self.prefix, self.suffix)

    def __hash__(self):
        return hash((id(type(self)), self.prefix, self.suffix))


class StarRegex(Regex):
    """
    A regular expression that will match a certain regex, repeated any number
    of times.

    :param regex: The regular expression describing the strings to repeat.
    """
    __slots__ = ('regex')

    def __init__(self, regex):
        self.regex = regex

    def derive(self, sym):
        return concat(self.regex.derive(sym), self)

    accepts_empty_string = True

    @property
    def alphabet(self):
        return self.regex.alphabet

    def __repr__(self):
        return "star(%r)" % self.regex

    def __hash__(self):
        return hash((id(type(self)), self.regex))


class RepeatRegex(Regex):
    """
    A regular expression that will match a certain regex, repeated a specific
    number of times.

    :param regex: The regular expression describing the strings to repeat.
    :param count: The number of times to repeat it.
    """
    __slots__ = ('regex', 'count')

    def __init__(self, regex, count):
        if count < 2:
            raise ValueError("Repeat must be greater than 1" % count)
        self.regex = regex
        self.count = count

    def derive(self, sym):
        return concat(self.regex.derive(sym),
                      repeat(self.regex, self.count - 1))

    accepts_empty_string = False

    @property
    def alphabet(self):
        return self.regex.alphabet

    def __repr__(self):
        return "%r ** %d" % (self.regex, self.count)

    def __hash__(self):
        return hash((id(type(self)), self.regex, self.count))


### Regex constructors ###


[docs]def regexify(e):
    """
    Converts a Python object to a `Regex`. If it's already a `Regex`, it just
    returns it. It will also accept any string or character type, and create
    a regex that matches that exactly.

    :param e: The Python object to create a regex of.
    """
    if isinstance(e, Regex):
        return e
    elif isinstance(e, Strings):
        if len(e) == 0:
            return Epsilon
        return join(SymbolRegex(sym) for sym in e)
    elif isinstance(e, Characters):
        return SymbolRegex(e)
    else:
        raise TypeError(n("Instances of %r can't be automatically converted "
                          "to regular expressions" % type(e)))


#: A regular expression that matches any single symbol.
Any = AnySymbolRegex()

#: A regular expression that matches the empty string.
Epsilon = EpsilonRegex()

#: A regular expression that refuses to accept *anything* -- even the
#: empty string.
Null = NullRegex()


[docs]def union(*options):
    """
    Creates a regular expression that accepts *any* of the following regexes.

    :param options: The regular expressions to accept.
    """
    # A list comprehension would be cleaner, but we need to be able to check
    # the value *after* processing to leave out Nulls.
    s = set()
    for regex in options:
        if isinstance(regex, UnionRegex):
            s.update(regex.options)
        else:
            regex = regexify(regex)
            if regex is not Null:
                s.add(regex)
    if not s:
        return Null
    elif len(s) == 1:
        return s.pop()
    else:
        return UnionRegex(s)


[docs]def concat(prefix, suffix):
    """
    Concatenates two regular expressions, such that `prefix` will be matched,
    then `suffix` will be matched after it is complete.

    :param prefix: The first regular expression.
    :param suffix: The second regular expression.
    """
    prefix = regexify(prefix)
    suffix = regexify(suffix)
    if prefix is Null or suffix is Null:
        return Null
    elif prefix is Epsilon:
        return suffix
    elif suffix is Epsilon:
        return prefix
    else:
        return ConcatRegex(prefix, suffix)


[docs]def join(regexes):
    """
    Concatenates a sequence of regular expressions, such that each one
    will be matched successively. (Note that a `Null` value anywhere in the
    sequence will result in a `Null` overall.) A zero-length sequence
    will return `Epsilon`.

    :param regexes: The regular expressions to concatenate.
    """
    if not isinstance(regexes, Sequence):
        regexes = tuple(regexes)

    n = len(regexes)
    if n == 2:
        return concat(regexes[0], regexes[1])
    elif n == 1:
        return regexify(regexes[0])
    elif n == 0:
        return Epsilon
    else:
        # This is a right fold (`reduce` backwards).
        # [-2::-1] is slice for "start at the next-to-last and go in reverse."
        final = regexes[-1]
        for regex in regexes[-2::-1]:
            final = concat(regex, final)
            # Break early on Null.
            if final is Null:
                return Null
        return final


[docs]def star(regex):
    """
    Creates a regular expression that accepts `regex`, repeated any number
    of times -- even 0.

    :param regex: The regular expression describing the strings to repeat.
    """
    if regex is Epsilon:
        return Epsilon
    elif regex is Null:
        return Epsilon
    elif isinstance(regex, StarRegex):
        # r* == r**, so we can avoid wrapping it again and wasting time.
        return regex
    else:
        return StarRegex(regexify(regex))


[docs]def repeat(regex, count):
    """
    Creates a regular expression that accepts `regex` repeated a specific
    number of times.

    :param regex: The regular expression describing the strings to reepeat.
    :param count: The number of times to repeat it.
    """
    if count == 0:
        return Epsilon
    elif count == 1:
        return regex
    elif regex is Epsilon:
        return Epsilon
    elif regex is Null:
        return Null
    else:
        return RepeatRegex(regexify(regex), count)
Source code for lexington.regex

Project Versions

This Page

Navigation

Source code for lexington.regex

Project Versions

RTD Search

This Page

Quick search

Navigation