diff --git a/.envrc b/.envrc new file mode 100644 index 0000000..f49b722 --- /dev/null +++ b/.envrc @@ -0,0 +1,2 @@ +PATH_add $(nix build --impure --expr "(import ).default.python.withPackages (ps: [ps.pip ps.pyparsing ps.hypothesis ps.pytest ps.pytest-cov ps.setuptools ps.ipython ps.pylint])" --no-link --print-out-paths)/bin +path_add PYTHONPATH $(pwd)/ diff --git a/README.md b/README.md index 809c2e6..244c42a 100644 --- a/README.md +++ b/README.md @@ -9,20 +9,18 @@ Dynamically construct python regex patterns. Say you want a regex pattern to match the initials of someones name. ```python -import re -from regexfactory import Amount, Range - +from regexfactory import * -pattern = Amount(Range("A", "Z"), 2, 3) +pattern = amount(Range("A", "Z"), 2, 3) -matches = pattern.findall( - "My initials are BDP. Valorie's are VO" -) +matches = pattern.findall("My initials are BDP. Valorie's are VO") +print(pattern.regex) print(matches) ``` -```bash +``` +[A-Z]{2,3} ['BDP', 'VO'] ``` @@ -31,36 +29,24 @@ print(matches) Or how matching both uppercase and lowercase hex strings in a sentence. ```python -import re from regexfactory import * -pattern = Optional("#") + Or( - Amount( - Set( - Range("0", "9"), - Range("a", "f") - ), - 6 - ), - Amount( - Set( - Range("0", "9"), - Range("A", "F") - ), - 6 - ), - +pattern = optional("#") + or_( + (Range("0", "9") | Range("a", "f")) * 6, + (Range("0", "9") | Range("A", "F")) * 6, ) sentence = """ My favorite color is #000000. I also like 5fb8a0. My second favorite color is #FF21FF. """ +print(pattern.regex) matches = pattern.findall(sentence) print(matches) ``` -```bash +``` +(?:#)?(?:[0-9a-f]{6}|[0-9A-F]{6}) ['#000000', '5fb8a0', '#FF21FF'] ``` @@ -71,30 +57,25 @@ Or what if you want to match urls in html content? ```python from regexfactory import * - -protocol = Amount(Range("a", "z"), 1, or_more=True) -host = Amount(Set(WORD, DIGIT, '.'), 1, or_more=True) -port = Optional(IfBehind(":") + Multi(DIGIT)) -path = Multi( - RegexPattern('/') + Multi( - NotSet('/', '#', '?', '&', WHITESPACE), - match_zero=True - ), - match_zero=True +protocol = amount(Range("a", "z"), 1, or_more=True) +host = amount(WORD | DIGIT | r"\.", 1, or_more=True) +port = optional(":" + multi(DIGIT)) +path = multi( + "/" + multi(NotSet("/", "#", "?", "&", WHITESPACE), match_zero=True), + match_zero=True, ) -patt = protocol + RegexPattern("://") + host + port + path - +patt = protocol + "://" + host + port + path sentence = "This is a cool url, https://github.com/GrandMoff100/RegexFactory/ " -print(patt) +print(patt.regex) print(patt.search(sentence)) ``` -```bash -[a-z]{1,}://[\w\d.]{1,}(?:\d{1,})?(/([^/#?&\s]{0,})){0,} - +``` +[a-z]+://[\w\d\.]+(?::\d+)?(?:/[^/\#\?\&\s]*)* + ``` ## The Pitch diff --git a/regexfactory/__init__.py b/regexfactory/__init__.py index e8bd145..0d236f0 100644 --- a/regexfactory/__init__.py +++ b/regexfactory/__init__.py @@ -23,10 +23,21 @@ WHITESPACE, WORD, ) -from .pattern import ESCAPED_CHARACTERS, RegexPattern, ValidPatternType, escape, join +from .pattern import ( + ESCAPED_CHARACTERS, + RegexPattern, + ValidPatternType, + amount, + escape, + join, + multi, + optional, + or_, +) from .patterns import ( Amount, Comment, + Concat, Extension, Group, IfAhead, @@ -37,12 +48,10 @@ Multi, NamedGroup, NamedReference, - NotSet, NumberedReference, Optional, Or, - Range, - Set, ) +from .sets import EMPTY, NEVER, NotSet, Range, Set __version__ = "1.0.1" diff --git a/regexfactory/chars.py b/regexfactory/chars.py index 8bde3bb..5fa10b2 100644 --- a/regexfactory/chars.py +++ b/regexfactory/chars.py @@ -8,30 +8,38 @@ """ from .pattern import RegexPattern +from .sets import CharClass #: (Dot.) In the default mode, this matches any character except a newline. If the :data:`re.DOTALL` flag has been specified, this matches any character including a newline. -ANY = RegexPattern(r".") +ANY = RegexPattern(r".", _precedence=10) +ANY._desc = "ANY" #: (Caret.) Matches the start of the string, and in :data:`re.MULTILINE` mode also matches immediately after each newline. -ANCHOR_START = RegexPattern(r"^", _precedence=2) +ANCHOR_START = RegexPattern(r"^", _precedence=0) #: Matches the end of the string or just before the newline at the end of the string, and in :data:`re.MULTILINE` mode also matches before a newline. foo matches both :code:`foo` and :code:`foobar`, while the regular expression :code:`foo$` matches only :code:`foo`. More interestingly, searching for :code:`foo.$` in :code:`foo1\nfoo2\n` matches :code:`foo2` normally, but :code:`foo1` in :data:`re.MULTILINE` mode; searching for a single $ in :code:`foo\n` will find two (empty) matches: one just before the newline, and one at the end of the string. -ANCHOR_END = RegexPattern(r"$", _precedence=2) +ANCHOR_END = RegexPattern(r"$", _precedence=0) #: Matches Unicode whitespace characters (which includes :code:`[ \t\n\r\f\v]`, and also many other characters, for example the non-breaking spaces mandated by typography rules in many languages). If the :data:`re.ASCII` flag is used, only :code:`[ \t\n\r\f\v]` is matched. -WHITESPACE = RegexPattern(r"\s") +WHITESPACE = CharClass(r"\s") +WHITESPACE._desc = "WHITESPACE" #: Matches any character which is not a whitespace character. This is the opposite of \s. If the :data:`re.ASCII` flag is used this becomes the equivalent of :code:`[^ \t\n\r\f\v]`. -NOTWHITESPACE = RegexPattern(r"\S") +NOTWHITESPACE = CharClass(r"\S") +NOTWHITESPACE._desc = "NOTWHITESPACE" #: Matches Unicode word characters; this includes most characters that can be part of a word in any language, as well as numbers and the underscore. If the :data:`re.ASCII` flag is used, only :code:`[a-zA-Z0-9_]` is matched. -WORD = RegexPattern(r"\w") +WORD = CharClass(r"\w") +WORD._desc = "WORD" #: Matches any character which is not a word character. This is the opposite of \w. If the :data:`re.ASCII` flag is used this becomes the equivalent of :code:`[^a-zA-Z0-9_]`. If the :data:`re.LOCALE` flag is used, matches characters which are neither alphanumeric in the current locale nor the underscore. -NOTWORD = RegexPattern(r"\W") +NOTWORD = CharClass(r"\W") +NOTWORD._desc = "NOTWORD" #: Matches any Unicode decimal digit (that is, any character in Unicode character category [Nd]). This includes :code:`[0-9]`, and also many other digit characters. If the :data:`re.ASCII` flag is used only :code:`[0-9]` is matched. -DIGIT = RegexPattern(r"\d") +DIGIT = CharClass(r"\d") +DIGIT._desc = "DIGIT" #: Matches any character which is not a decimal digit. This is the opposite of \d. If the :data:`re.ASCII` flag is used this becomes the equivalent of :code:`[^0-9]`. -NOTDIGIT = RegexPattern(r"\D") +NOTDIGIT = CharClass(r"\D") +NOTDIGIT._desc = "NOTDIGIT" diff --git a/regexfactory/pattern.py b/regexfactory/pattern.py index 7d428b4..12d3de1 100644 --- a/regexfactory/pattern.py +++ b/regexfactory/pattern.py @@ -7,8 +7,8 @@ # pylint: disable=cyclic-import - import re +import sys from typing import Any, Iterator, List, Optional, Tuple, Union #: @@ -17,18 +17,8 @@ #: Special characters that need to be escaped to be used without their special meanings. ESCAPED_CHARACTERS = "()[]{}?*+-|^$\\.&~#" - -def join(*patterns: ValidPatternType) -> "RegexPattern": - """Umbrella function for combining :class:`ValidPatternType`'s into a :class:`RegexPattern`.""" - joined = RegexPattern("") - for pattern in patterns: - joined += RegexPattern(pattern) - return joined - - -def escape(string: str) -> "RegexPattern": - """Escapes special characters in a string to use them without their special meanings.""" - return RegexPattern(re.escape(string)) +_enable_debug: bool = False +_enable_desc: bool = False class RegexPattern: @@ -37,69 +27,144 @@ class RegexPattern: """ regex: str + _reference_regex: Optional[str] = None + _desc: Optional[str] = None #: The precedence of the pattern. Higher precedence patterns are evaluated first. - # Precedence order here (https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html#tag_09_04_08) - precedence: int + # Precedence modified from here (https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html#tag_09_04_08) + + # 3 Bracket expression [] + # 2 Grouping () + # 1 Single-character-ERE duplication * + ? {m,n} + # 0 Concatenation, Anchoring ^ $ + # -2 Alternation | + _precedence: int + + def __init__(self, regex: str, *, _precedence: int = -10) -> None: + self.regex = regex + self._precedence = _precedence + + @property + def _get_ref(self): + if self._reference_regex is not None: + ans = self._reference_regex + else: + ans = self.regex + return f"(?:{ans})" + + @property + def _get_desc(self): + if self._desc is not None: + return self._desc + return repr(self.regex) + + @staticmethod + def from_regex_str(regex: str) -> "RegexPattern": + """create a RegexPattern from a regex. raises ValueError if regex is invalid.""" + + # attempt to treat regex as a literal string + if regex == re.escape(regex): + return escape(regex) + parts = regex.split(r"\\\\") + guess = "\\".join(x.replace("\\", "") for x in parts) + if regex == re.escape(guess): + return escape(guess) + + # https://stackoverflow.com/questions/19630994/how-to-check-if-a-string-is-a-valid-regex-in-python + try: + re.compile(regex) + except re.error as e: + raise ValueError(f"invalid regex {regex}") from e + ans = RegexPattern(regex, _precedence=-10) + if _enable_debug: + ans._reference_regex = regex + if _enable_desc: + ans._desc = f"RegexPattern.from_regex_str({regex!r})" + return ans + + @staticmethod + def create(obj: ValidPatternType) -> "RegexPattern": + """ + creats a RegexPattern from a str or RegexPattern. + if obj is a RegexPattern, its returned. + otherwise obj is treated as a regex. + """ + if isinstance(obj, RegexPattern): + return obj + if isinstance(obj, str): + return RegexPattern.from_regex_str(obj) + if isinstance(obj, re.Pattern): + return RegexPattern.from_regex_str(obj.pattern) + raise TypeError(f"Can't get regex from {obj.__class__.__qualname__} object.") + + @staticmethod + def _ensure_precedence( + pattern: ValidPatternType, precedence: int + ) -> "RegexPattern": + from .patterns import Group + + p = RegexPattern.create(pattern) - def __init__(self, pattern: ValidPatternType, /, _precedence: int = 1) -> None: - self.regex = self.get_regex(pattern) - self.precedence = ( - _precedence if not isinstance(pattern, RegexPattern) else pattern.precedence - ) + if p._precedence >= precedence: + return p + + assert precedence <= 2 + return Group(pattern, capturing=False) + + @staticmethod + def _ensure_precedence_fn(precedence: int): + def inner(pattern: ValidPatternType): + return RegexPattern._ensure_precedence(pattern, precedence).regex + + return inner def __repr__(self) -> str: raw_regex = f"{self.regex!r}".replace("\\\\", "\\") + if self._desc is not None: + return f"RegexPattern({self._desc}, {raw_regex}, {self._reference_regex})" return f"" def __str__(self) -> str: - return self.regex + return self._ensure_precedence(self, 2).regex def __add__(self, other: ValidPatternType) -> "RegexPattern": """Adds two :class:`ValidPatternType`'s together, into a :class:`RegexPattern`""" - from .patterns import Group # pylint: disable=import-outside-toplevel - - try: - other_pattern = ( - RegexPattern(other) if not isinstance(other, RegexPattern) else other - ) - except TypeError: - return NotImplemented - - if self.precedence > other_pattern.precedence: - return RegexPattern( - self.regex + self.get_regex(Group(other_pattern, capturing=False)) - ) - if self.precedence < other_pattern.precedence: - return RegexPattern( - self.get_regex(Group(self, capturing=False)) + other_pattern.regex - ) - return RegexPattern(self.regex + other_pattern.regex) + return join(self, other) def __radd__(self, other: ValidPatternType) -> "RegexPattern": """Adds two :class:`ValidPatternType`'s together, into a :class:`RegexPattern`""" - from .patterns import Group # pylint: disable=import-outside-toplevel - - try: - other_pattern = ( - RegexPattern(other) if not isinstance(other, RegexPattern) else other - ) - except TypeError: - return NotImplemented - - if self.precedence > other_pattern.precedence: - return RegexPattern( - self.get_regex(Group(other_pattern, capturing=False)) + self.regex - ) - if self.precedence < other_pattern.precedence: - return RegexPattern( - other_pattern.regex + self.get_regex(Group(self, capturing=False)) - ) - return RegexPattern(other_pattern.regex + self.regex) + return join(other, self) def __mul__(self, coefficient: int) -> "RegexPattern": - """Treats :class:`RegexPattern` as a string and multiplies it by an integer.""" - return RegexPattern(self.regex * coefficient) + """matches exactly coefficient counts of self""" + + return amount(self, coefficient) + + def __getitem__(self, arg: slice) -> "RegexPattern": + if isinstance(arg, slice): + assert arg.step is None + if isinstance(arg.start, int): + start = arg.start + else: + assert arg.start is None + start = 0 + if isinstance(arg.stop, int): + end = arg.stop + or_more = False + else: + assert arg.stop is None + end = None + or_more = True + + return amount(self, start, end, or_more=or_more) + + raise ValueError(f"{arg}") + + def __or__(self, other: ValidPatternType): + return or_(self, other) + + def __ror__(self, other): + return or_(other, self) def __eq__(self, other: Any) -> bool: """ @@ -107,30 +172,13 @@ def __eq__(self, other: Any) -> bool: Otherwise return false. """ if isinstance(other, (str, re.Pattern, RegexPattern)): - return ( - self.regex == RegexPattern(other).regex - and self.precedence == RegexPattern(other).precedence - ) + return self.regex == self.create(other).regex return super().__eq__(other) def __hash__(self) -> int: """Hashes the regex string.""" return hash(self.regex) - @staticmethod - def get_regex(obj: ValidPatternType, /) -> str: - """ - Extracts the regex content from :class:`RegexPattern` or :class:`re.Pattern` objects - else return the input :class:`str`. - """ - if isinstance(obj, RegexPattern): - return obj.regex - if isinstance(obj, str): - return obj - if isinstance(obj, re.Pattern): - return obj.pattern - raise TypeError(f"Can't get regex from {obj.__class__.__qualname__} object.") - def compile( self, *, @@ -219,9 +267,174 @@ def search( content: str, /, pos: int = 0, - endpos: int = 0, + endpos: int = sys.maxsize, *, flags: int = 0, ) -> Optional[re.Match]: """See :meth:`re.Pattern.search`.""" return self.compile(flags=flags).search(content, pos, endpos) + + +def join(*patterns: ValidPatternType) -> RegexPattern: + """Umbrella function for combining :class:`ValidPatternType`'s into a :class:`RegexPattern`.""" + from .patterns import Concat + + ps = [RegexPattern.create(p) for p in patterns] + ans = Concat(*ps) + if _enable_debug: + ans._reference_regex = "".join(x._get_ref for x in ps) + if _enable_desc: + ans._desc = "+".join(x._get_desc for x in ps) + return ans + + +def escape(string: str) -> RegexPattern: + """Escapes special characters in a string to use them without their special meanings.""" + ans = _escape(string) + + if _enable_debug: + ans._reference_regex = re.escape(string) + if _enable_desc: + if re.escape(string) == string: + ans._desc = repr(string) + else: + ans._desc = f"escape({repr(string)})" + return ans + + +def _escape(string: str) -> RegexPattern: + if len(string) == 0: + from .sets import EMPTY + + return EMPTY + if len(string) == 1: + from .sets import CharLiteral + + return CharLiteral(re.escape(string)) + return RegexPattern(re.escape(string), _precedence=0) + + +def or_(*args: ValidPatternType) -> RegexPattern: + """ + matches any one of args. args is tried from left to right. + """ + from .patterns import Or + from .sets import NEVER, Set, _is_charset + + args_rx = [RegexPattern.create(x) for x in args] + + if len(args_rx) == 1: + ans = args_rx[0] + else: + all_cs = all(map(_is_charset, args_rx)) + + if all_cs: + ans = Set(*args_rx) # type: ignore + else: + ans = Or(*args_rx) + + if _enable_debug: + if len(args_rx) > 0: + ans._reference_regex = "|".join(x._get_ref for x in args_rx) + else: + ans._reference_regex = NEVER.regex + if _enable_desc: + desc = ",".join(x._get_desc for x in args_rx) + ans._desc = f"or_({desc})" + return ans + + +def amount( + pattern: ValidPatternType, + i: int, + j: Optional[int] = None, + *, + or_more: bool = False, + greedy: bool = True, +) -> RegexPattern: + """ + (1) amount(pattern, i, j): + matches between i and j, inclusive, copies of pattern + (2) amount(pattern, i): + matches exactly i copies of pattern + (3) amount(pattern, i, or_more = True): + matches at least i copies of pattern + + if greedy is True, match as long as possible; othewise match as short as possible. + backtracking is always enabled, regardless of the setting of greedy. + """ + + if j is not None and or_more: + raise ValueError() + + from .patterns import Amount + + pt = RegexPattern.create(pattern) + + ans = _amount(pt, i, j, or_more, greedy) + if _enable_debug: + ans._reference_regex = Amount( + RegexPattern(pt._get_ref, _precedence=2).regex, i, j, or_more, greedy + ).regex + if _enable_desc: + pieces = [pt._get_desc, str(i)] + if j is not None: + pieces.append(str(j)) + if or_more: + pieces.append("or_more=True") + if not greedy: + pieces.append("greedy=False") + ans._desc = f"Amount({','.join(pieces)})" + return ans + + +def _amount( + pattern: RegexPattern, i: int, j: Optional[int], or_more: bool, greedy: bool +) -> RegexPattern: + from .patterns import Amount, Multi + from .patterns import Optional as POpt + from .sets import EMPTY + + if j is None and not or_more: + j = i + + if i == 0 and j == 0: + return EMPTY + + if i == 1 and j == 1: + return pattern + + if i == 0 and j == 1: + return POpt(pattern, greedy) + + if i == 0 and j is None and or_more: + return Multi(pattern, True, greedy) + + if i == 1 and j is None and or_more: + return Multi(pattern, False, greedy) + + if i == j: + j = None + + return Amount(pattern, i, j, or_more, greedy) + + +def multi( + pattern: ValidPatternType, + match_zero: bool = False, + greedy: bool = True, +): + """ + maches one or more counts of pattern. + if match_zero=True, match zero or more instead. + """ + if match_zero: + return amount(pattern, 0, or_more=True, greedy=greedy) + return amount(pattern, 1, or_more=True, greedy=greedy) + + +def optional(pattern: ValidPatternType, greedy: bool = True): + """ + maches zero or one counts of pattern. + """ + return amount(pattern, 0, 1, greedy=greedy) diff --git a/regexfactory/patterns.py b/regexfactory/patterns.py index 69bd993..66b7ce0 100644 --- a/regexfactory/patterns.py +++ b/regexfactory/patterns.py @@ -11,6 +11,18 @@ from regexfactory.pattern import RegexPattern, ValidPatternType +class Concat(RegexPattern): + """ + a concatation of patterns. + """ + + def __init__(self, *patterns: ValidPatternType) -> None: + super().__init__( + "".join(map(self._ensure_precedence_fn(0), patterns)), + _precedence=0, + ) + + class Or(RegexPattern): """ For matching multiple patterns. @@ -28,124 +40,10 @@ class Or(RegexPattern): """ - def __init__( - self, - *patterns: ValidPatternType, - ) -> None: - regex = "|".join( - map( - self.get_regex, - ( - Group( - pattern, - capturing=False, - ) - for pattern in patterns - ), - ) - ) - super().__init__((regex)) - - -class Range(RegexPattern): - """ - For matching characters between two character indices - (using the Unicode numbers of the input characters.) - You can find use :func:`chr` and :func:`ord` - to translate characters their Unicode numbers and back again. - For example, :code:`chr(97)` returns the string :code:`'a'`, - while :code:`chr(8364)` returns the string :code:`'€'` - Thus, matching characters between :code:`'a'` and :code:`'z'` - is really checking whether a characters unicode number - is between :code:`ord('a')` and :code:`ord('z')` - - .. exec_code:: - - from regexfactory import Range, Or - - patt = Or("Bob", Range("a", "z")) - - print(patt.findall("my job is working for Bob")) - - """ - - def __init__(self, start: str, stop: str) -> None: - self.start = start - self.stop = stop - regex = f"[{start}-{stop}]" - super().__init__(regex) - - -class Set(RegexPattern): - """ - For matching a single character from a list of characters. - Keep in mind special characters like :code:`+` and :code:`.` - lose their meanings inside a set/list, - so need to escape them here to use them. - - In practice, :code:`Set("a", ".", "z")` - functions the same as :code:`Or("a", ".", "z")` - The difference being that :class:`Or` accepts :class:`RegexPattern` 's - and :class:`Set` accepts characters only. - Special characters do **NOT** lose their special meaings inside an :class:`Or` though. - The other big difference is performance, - :class:`Or` is a lot slower than :class:`Set`. - - .. exec_code:: - - import time - from regexfactory import Or, Set - - start_set = time.time() - print(patt := Set(*"a.z").compile()) - print("Set took", time.time() - start_set, "seconds to compile") - print("And the resulting match is", patt.match("b")) - - print() - - start_or = time.time() - print(patt := Or(*"a.z").compile()) - print("Or took", time.time() - start_or, "seconds to compile") - print("And the resulting match is", patt.match("b")) - - """ - def __init__(self, *patterns: ValidPatternType) -> None: - regex = "" - for pattern in patterns: - if isinstance(pattern, Range): - regex += f"{pattern.start}-{pattern.stop}" - else: - regex += self.get_regex(pattern) - super().__init__(f"[{regex}]") - - -class NotSet(RegexPattern): - """ - For matching a character that is **NOT** in a list of characters. - Keep in mind special characters lose their special meanings inside :class:`NotSet`'s as well. - - .. exec_code:: - - from regexfactory import NotSet, Set - - not_abc = NotSet(*"abc") - - is_abc = Set(*"abc") - - print(not_abc.match("x")) - print(is_abc.match("x")) - - """ - - def __init__(self, *patterns: ValidPatternType) -> None: - regex = "" - for pattern in patterns: - if isinstance(pattern, Range): - regex += f"{pattern.start}-{pattern.stop}" - else: - regex += self.get_regex(pattern) - super().__init__(f"[^{regex}]") + super().__init__( + "|".join(map(self._ensure_precedence_fn(-2), patterns)), _precedence=-2 + ) class Amount(RegexPattern): @@ -184,6 +82,7 @@ class Amount(RegexPattern): """ + # pylint: disable=too-many-arguments def __init__( self, pattern: ValidPatternType, @@ -193,13 +92,20 @@ def __init__( greedy: bool = True, ) -> None: if j is not None: + assert not or_more amount = f"{i},{j}" elif or_more: amount = f"{i}," else: amount = f"{i}" - regex = self.get_regex(pattern) + "{" + amount + "}" + ("" if greedy else "?") - super().__init__(regex) + regex = ( + self._ensure_precedence(pattern, 2).regex + + "{" + + amount + + "}" + + ("" if greedy else "?") + ) + super().__init__(regex, _precedence=1) class Multi(RegexPattern): @@ -217,8 +123,8 @@ def __init__( suffix = "*" if match_zero else "+" if greedy is False: suffix += "?" - regex = self.get_regex(Group(pattern, capturing=False)) - super().__init__(regex + suffix) + regex = self._ensure_precedence(pattern, 2).regex + super().__init__(regex + suffix, _precedence=1) class Optional(RegexPattern): @@ -227,17 +133,19 @@ class Optional(RegexPattern): Functions the same as :code:`Amount(pattern, 0, 1)`. """ - def __init__(self, pattern: ValidPatternType, greedy: bool = True) -> None: - regex = Group(pattern, capturing=False) + "?" + ("" if greedy else "?") - super().__init__(regex) + def __init__(self, pattern: ValidPatternType, greedy: bool = True): + regex = ( + self._ensure_precedence(pattern, 2).regex + "?" + ("" if greedy else "?") + ) + super().__init__(regex, _precedence=1) class Extension(RegexPattern): """Base class for extension pattern classes.""" def __init__(self, prefix: str, pattern: ValidPatternType): - regex = self.get_regex(pattern) - super().__init__(f"(?{prefix}{regex})") + regex = self.create(pattern).regex + super().__init__(f"(?{prefix}{regex})", _precedence=2) class NamedGroup(Extension): @@ -444,6 +352,7 @@ def __init__(self, pattern: ValidPatternType, capturing: bool = True) -> None: RegexPattern.__init__( # pylint: disable=non-parent-init-called self, f"({pattern})", + _precedence=2, ) diff --git a/regexfactory/sets.py b/regexfactory/sets.py new file mode 100644 index 0000000..0a46a00 --- /dev/null +++ b/regexfactory/sets.py @@ -0,0 +1,192 @@ +""" +Charset Subclasses +************************ + +Charset is any pattern that matches exactly single character, out of a set. + +for example, :code:`'a'`, :code:`'[ab]'` or :code:`r'[a\\w]'` + +Charset gets special support. for example +:code:`or_("a", "b")` returns `` +""" + +# pylint: disable=cyclic-import + +import itertools +import re +import typing as t + +from .pattern import RegexPattern +from .patterns import IfNotAhead + +CharSet = t.Union[str, "Set", "CharLiteral", "CharClass", "Range"] + +# matches nothing, always succeeds +EMPTY = RegexPattern("", _precedence=0) +EMPTY._desc = "EMPTY" + +# always fails +NEVER = IfNotAhead(EMPTY) +NEVER._desc = "NEVER" + + +def _is_charset(c: RegexPattern) -> t.TypeGuard[CharSet]: + return isinstance(c, (Set, CharLiteral, CharClass, Range)) + + +def _charset_normalize(c: CharSet) -> list[str]: + if isinstance(c, str): + return [re.escape(x) for x in c] + if isinstance(c, Range): + return [c._regex_inner] + if isinstance(c, Set): + return c.members + if isinstance(c, CharLiteral): + return [c.regex] + if isinstance(c, CharClass): + return [c.regex] + + raise TypeError(f"invalid charset: {c!r}") + + +class CharLiteral(RegexPattern): + """ + matches a single char literal like "A" + """ + + def __init__(self, c: str, _precedence=10): + super().__init__(c, _precedence=_precedence) + + +class CharClass(RegexPattern): + """ + base class for patterns that matches exactly one from a set of characters, + for example DIGIT or WORD + """ + + def __init__(self, c: str, _precedence=10): + super().__init__(c, _precedence=_precedence) + + +class Range(RegexPattern): + """ + For matching characters between two character indices + (using the Unicode numbers of the input characters.) + You can find use :func:`chr` and :func:`ord` + to translate characters their Unicode numbers and back again. + For example, :code:`chr(97)` returns the string :code:`'a'`, + while :code:`chr(8364)` returns the string :code:`'€'` + Thus, matching characters between :code:`'a'` and :code:`'z'` + is really checking whether a characters unicode number + is between :code:`ord('a')` and :code:`ord('z')` + + .. exec_code:: + + from regexfactory import Range, Or + + patt = Or("Bob", Range("a", "z")) + + print(patt.findall("my job is working for Bob")) + + """ + + def __init__(self, start: str, stop: str) -> None: + if len(start) != 1: + raise ValueError(f"invalid start: {start!r}") + if len(stop) != 1: + raise ValueError(f"invalid stop: {stop!r}") + if ord(stop) < ord(start): + raise ValueError(f"invalid range: {start!r} to {stop!r}") + + self.start = start + self.stop = stop + + self._regex_inner = f"{re.escape(start)}-{re.escape(stop)}" + super().__init__(f"[{self._regex_inner}]", _precedence=3) + + +class Set(RegexPattern): + """ + For matching a single character from a list of characters. + Keep in mind special characters like :code:`+` and :code:`.` + lose their meanings inside a set/list, + so need to escape them here to use them. + + In practice, :code:`Set("a", ".", "z")` + functions the same as :code:`Or("a", ".", "z")` + The difference being that :class:`Or` accepts :class:`RegexPattern` 's + and :class:`Set` accepts characters only. + Special characters do **NOT** lose their special meaings inside an :class:`Or` though. + The other big difference is performance, + :class:`Or` is a lot slower than :class:`Set`. + + .. exec_code:: + + import time + from regexfactory import Or, Set + + start_set = time.time() + print(patt := Set(*"a.z").compile()) + print("Set took", time.time() - start_set, "seconds to compile") + print("And the resulting match is", patt.match("b")) + + print() + + start_or = time.time() + print(patt := Or(*"a.z").compile()) + print("Or took", time.time() - start_or, "seconds to compile") + print("And the resulting match is", patt.match("b")) + + """ + + members: list[str] = [] + + def __init__(self, *charsets: CharSet) -> None: + self.members = list( + itertools.chain(*(_charset_normalize(arg) for arg in charsets)) + ) + if len(self.members) == 0: + regex = NEVER.regex + prec = NEVER._precedence + else: + regex = "[" + "".join(self.members) + "]" + prec = 3 + + super().__init__(regex, _precedence=prec) + + +class NotSet(RegexPattern): + """ + For matching a character that is **NOT** in a list of characters. + Keep in mind special characters lose their special meanings inside :class:`NotSet`'s as well. + + .. exec_code:: + + from regexfactory import NotSet, Set + + not_abc = NotSet(*"abc") + + is_abc = Set(*"abc") + + print(not_abc.match("x")) + print(is_abc.match("x")) + + """ + + members: list[str] = [] + + def __init__(self, *charsets: CharSet): + self.members = list( + itertools.chain(*(_charset_normalize(arg) for arg in charsets)) + ) + if len(self.members) == 0: + from .chars import NOTWHITESPACE, WHITESPACE + + any_sing = WHITESPACE | NOTWHITESPACE + regex = any_sing.regex + prec = any_sing._precedence + else: + regex = "[^" + "".join(self.members) + "]" + prec = 3 + + super().__init__(regex, _precedence=prec) diff --git a/setup.cfg b/setup.cfg index c2e4b6b..723143c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -47,4 +47,4 @@ per-file-ignores = __init__.py:F401, conf.py:E402 profile = black [pylint.messages_control] -disable = too-few-public-methods, too-many-arguments, line-too-long +disable = too-few-public-methods, too-many-arguments, line-too-long, import-outside-toplevel, protected-access diff --git a/tests/conftest.py b/tests/conftest.py index e5c6d22..2e063f4 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,4 +1,8 @@ -from hypothesis import settings +from hypothesis import Verbosity, settings + +from regexfactory import pattern # Set default profile to use 500 examples -settings.register_profile("default", max_examples=500) +settings.register_profile("default", max_examples=500, verbosity=Verbosity.normal) + +pattern._enable_desc = True diff --git a/tests/strategies.py b/tests/strategies.py index 74d52c6..35ce969 100644 --- a/tests/strategies.py +++ b/tests/strategies.py @@ -1,22 +1,101 @@ +from hypothesis import event from hypothesis import strategies as st -from regexfactory.pattern import ESCAPED_CHARACTERS +import regexfactory as r +from regexfactory import NotSet, Range, RegexPattern, amount, escape, or_ -# Strategy to generate characters that are not used in escapes -non_escape_char = st.characters(blacklist_characters=list(ESCAPED_CHARACTERS)) +@st.composite +def pat_base(draw: st.DrawFn) -> RegexPattern: + v = draw(st.integers(0, 99)) + v = 99 - v -# Strategy to generate text that avoids escaped characters -non_escaped_text = st.text(min_size=1, alphabet=non_escape_char) + def case(weight: int): + nonlocal v + v -= weight + return v < 0 + if case(10): + charsets = [ + r.ANY, + r.ANCHOR_START, + r.ANCHOR_END, + r.WHITESPACE, + r.NOTWHITESPACE, + r.WORD, + r.NOTWORD, + r.DIGIT, + r.NOTDIGIT, + ] + return draw(st.sampled_from(charsets)) -# Strategy to produce either None or a positive integer -optional_step = st.one_of(st.none(), st.integers(min_value=1)) + if case(10): + chars = draw(st.lists(elements=st.characters(codec="utf-8"))) + return or_(*(escape(x) for x in chars)) + if case(5): + chars = draw(st.lists(elements=st.characters(codec="utf-8"))) + return NotSet(*chars) -def build_bounds(lower_bound, step) -> range: - """ - Function to generate a tuple of (lower, upper) in which lower < upper - """ - upper_bound = lower_bound + step - return range(lower_bound, upper_bound + 1) + if case(5): + x = draw(st.characters(codec="utf-8")) + y = draw(st.characters(codec="utf-8")) + if x > y: + x, y = y, x + return Range(x, y) + + return escape(draw(st.text())) + + +@st.composite +def _pat_extend(draw: st.DrawFn, children: st.SearchStrategy[RegexPattern]): + if not draw(st.booleans()): + return draw(children) + draw(children) + + if not draw(st.booleans()): + return draw(children) | draw(children) + + c = draw(children) + + if draw(st.booleans()): + return amount( + c, + draw(st.integers(0, 4)), + or_more=draw(st.booleans()), + greedy=draw(st.booleans()), + ) + + a = draw(st.integers(0, 2)) + b = draw(st.integers(0, 2)) + return amount(c, a, a + b, or_more=False, greedy=draw(st.booleans())) + + +pat_generic: st.SearchStrategy[RegexPattern] = st.recursive(pat_base(), _pat_extend) + + +always_case = ["", "a", "ab", "aa", ".", " ", "\t", "\n"] + + +@st.composite +def _gencase_random(draw: st.DrawFn, r1: str, r2: str): + # use_true_random prevent shrinks + x = draw(st.randoms(use_true_random=True)).randint(0, 9) + + # use st.text() most of the time since its much faster + if x < 8: + return draw(st.text()) + + event("generating test case using st.from_regex") + if x < 9: + ans = draw(st.from_regex(r1)) + else: + ans = draw(st.from_regex(r2)) + + event("st.from_regex succeeded") + return ans + + +def gencase(r1: str, r2: str) -> st.SearchStrategy[str]: + """generate a str suitable for checking if r1 and r2 behave the same on that str""" + # allow shrinking towards always or text() + return st.sampled_from(always_case) | st.text() | _gencase_random(r1, r2) diff --git a/tests/test_amount.py b/tests/test_amount.py index 7d96735..659d116 100644 --- a/tests/test_amount.py +++ b/tests/test_amount.py @@ -1,89 +1,80 @@ -from typing import Optional - -import pytest from hypothesis import given from hypothesis import strategies as st -from strategies import build_bounds, optional_step - -from regexfactory import Amount, ValidPatternType - - -def build_amount( - pattern: ValidPatternType, - start: int, - or_more: bool, - greedy: bool, - step: Optional[int], -): - """ - General Amount builder. Note that the `j` parameter is constructed as - the step plus start when step is defined. When step is None we assume - that no upper bound is present. - """ - stop_value = None - if step is not None: - stop_value = start + step - return Amount( - pattern=pattern, i=start, j=stop_value, or_more=or_more, greedy=greedy +from strategies import pat_generic +from utils import check_one + +from regexfactory import amount, join, multi, optional + + +@given(pat_generic, st.integers(0, 4), st.data()) +def test_operator_mul(x, n, data): + check_one(x * n, join(*(x for _ in range(n))), data) + + +# invariants of amount + + +@given(pat_generic, st.integers(0, 4), st.booleans(), st.data()) +def test_amount_fixed1(x, n, greedy, data): + check_one( + amount(x, n, n, greedy=greedy), + x * n, + data, + ) + + +@given(pat_generic, st.integers(0, 4), st.booleans(), st.data()) +def test_amount_fixed2(x, n, greedy, data): + check_one( + amount(x, n, greedy=greedy), + x * n, + data, + ) + + +@given(pat_generic, st.integers(0, 4), st.integers(0, 4), st.booleans(), st.data()) +def test_amount_bounded(x, n, m, greedy, data): + check_one( + amount(x, n, n + m, greedy=greedy), + x * n + (optional(x, greedy=greedy)) * m, + data, + ) + + +@given(pat_generic, st.integers(0, 4), st.booleans(), st.data()) +def test_amount_or_more1(x, n, greedy, data): + check_one( + amount(x, n, or_more=True, greedy=greedy), + x * n + amount(x, 0, or_more=True, greedy=greedy), + data, + ) + + +@given(pat_generic, st.booleans(), st.data()) +def test_amount_or_more2(x, greedy, data): + check_one( + amount(x, 0, or_more=True, greedy=greedy), + optional(x, greedy=greedy) + amount(x, 0, or_more=True, greedy=greedy), + data, ) -@pytest.mark.patterns -@given(st.text(min_size=1), st.integers(min_value=1)) -def test_amount_single_count(word, count): - """ - Test to ensure that when `or_more=False` and no upper bound is - provided the regex will be of the form word{count}. - """ - actual = Amount(word, i=count, or_more=False) - assert actual.regex == "{word}{{{count}}}".format(word=word, count=str(count)) - - -@pytest.mark.patterns -@given( - st.text(min_size=1), - st.builds( - build_bounds, - lower_bound=st.integers(min_value=1), - step=st.integers(min_value=1), - ), -) -def test_amount_lower_upper(word, bound: range): - """ - Test to ensure that if a lower and upper bound are provided then the - regex of the resulting `Amount` will be of the form {word}{lower,upper}. - """ - actual = Amount(word, bound.start, bound.stop) - expected = "{word}{{{lower},{upper}}}".format( - word=word, lower=str(bound.start), upper=str(bound.stop) +# multi is consistent with + and * + + +@given(pat_generic, st.data()) +def test_multi1(x, data): + check_one( + multi(x), + f"(?:{x.regex})+", + data, ) - assert actual.regex == expected - - -@pytest.mark.patterns -@given(st.text(min_size=1), st.integers(min_value=1)) -def test_amount_or_more(word, count): - """ - Test to ensure that when `or_more=True` and no upper bound is - provided the regex will be of the form word{count,}. - """ - actual = Amount(word, count, or_more=True) - assert actual.regex == "{word}{{{count},}}".format(word=word, count=str(count)) - - -@pytest.mark.patterns -@given( - st.builds( - build_amount, - pattern=st.text(min_size=1), - start=st.integers(min_value=1), - or_more=st.booleans(), - greedy=st.just(False), - step=optional_step, + + +@given(pat_generic, st.data()) +def test_multi2(x, data): + check_one( + multi(x, match_zero=True), + f"(?:{x.regex})*", + data, ) -) -def test_amount_non_greedy(amt): - """ - Test to ensure that instances of Amount with greedy as False will end with "?" - """ - assert amt.regex.endswith("?") diff --git a/tests/test_escape.py b/tests/test_escape.py new file mode 100644 index 0000000..c38f40d --- /dev/null +++ b/tests/test_escape.py @@ -0,0 +1,18 @@ +import re + +from hypothesis import given +from hypothesis import strategies as st +from strategies import pat_generic +from utils import check_one + +from regexfactory import RegexPattern, escape + + +@given(pat_generic, st.data()) +def test_from_regex_str(x, data): + check_one(x, RegexPattern.from_regex_str(x.regex), data) + + +@given(st.text(), st.data()) +def test_escape_str(x, data): + check_one(escape(x), re.escape(x), data) diff --git a/tests/test_join.py b/tests/test_join.py index 727cdf6..18e83be 100644 --- a/tests/test_join.py +++ b/tests/test_join.py @@ -1,18 +1,21 @@ -import pytest -from hypothesis import example, given +from hypothesis import given from hypothesis import strategies as st -from strategies import non_escaped_text +from strategies import pat_generic +from utils import check_one -from regexfactory.pattern import join +from regexfactory import escape, join -@pytest.mark.pattern -@given(st.lists(elements=non_escaped_text, min_size=1, max_size=10, unique=True)) -@example(words=["0", "1"]) -def test_join(words: list): - """ - Tests to capture that the join function concatenates the expressions and - each word in the list is found in the larger regex. - """ - joined_regex = join(*words) - assert joined_regex.regex == "".join(words) +@given(pat_generic, pat_generic, st.data()) +def test_operator_add(x, y, data): + check_one(x + y, join(x, y), data) + + +@given(pat_generic, pat_generic, pat_generic, st.data()) +def test_join_assoc(x, y, z, data): + check_one(x + (y + z), (x + y) + z, data) + + +@given(st.text(), st.text(), st.data()) +def test_sum_escape(x, y, data): + check_one(escape(x + y), escape(x) + escape(y), data) diff --git a/tests/test_manual.py b/tests/test_manual.py new file mode 100644 index 0000000..dfeedc0 --- /dev/null +++ b/tests/test_manual.py @@ -0,0 +1,29 @@ +from hypothesis import given +from hypothesis import strategies as st + +from regexfactory import escape, optional + + +def check(a, b: bool): + assert (a is not None) == b + + +@given(st.text(), st.text()) +def test_anchor_start(a: str, b: str): + check(("^" + escape(a)).search(b), b.startswith(a)) + + +@given(st.text(), st.text()) +def test_anchor_end(a: str, b: str): + check((escape(a) + "$").search(b), b.endswith(a)) + + +@given(st.text(), st.text()) +def test_optional(a: str, b: str): + x = optional(escape(a)).search(a + b) + assert x is not None + assert x.group() == a + + x = optional(escape(a), greedy=False).search(a + b) + assert x is not None + assert x.group() == "" diff --git a/tests/test_or.py b/tests/test_or.py index b1b6579..782d98e 100644 --- a/tests/test_or.py +++ b/tests/test_or.py @@ -1,26 +1,47 @@ -import re - import pytest -from hypothesis import example, given +from hypothesis import given from hypothesis import strategies as st -from strategies import non_escaped_text +from strategies import pat_generic +from utils import check_one, check_regex + +from regexfactory import RegexPattern, escape, or_ + + +@given(pat_generic, pat_generic, st.data()) +def test_operator_or(x, y, data): + check_one(x | y, or_(x, y), data) + + +@given(pat_generic, pat_generic, pat_generic, st.data()) +def test_or_assoc(x, y, z, data): + check_one(x | (y | z), (x | y) | z, data) + + +@given(pat_generic, pat_generic, pat_generic, st.data()) +def test_join_or1(x, y, z, data): + check_one( + (x | y) + z, + (x + z) | (y + z), + data, + ) + + +def test_join_or2b(): + # above does not pass the other way. ex: + with pytest.raises(RuntimeError): + x = or_("ax", "a") + y = escape("xb") + z = escape("bc") -from regexfactory import Or + check_regex(x + (y | z), (x + y) | (x + z), "axbc") -@pytest.mark.patterns -@given( - st.lists( - non_escaped_text, - min_size=1, - max_size=10, +@given(pat_generic, pat_generic, pat_generic, st.data()) +def test_join_or2a(x: RegexPattern, y, z, data): + # should work if x is only allowed to match one thing + x = RegexPattern(f"(?>{x.regex})") + check_one( + x + (y | z), + (x + y) | (x + z), + data, ) -) -@example(arr=["0", "0"]) -def test_matching_or(arr: list): - actual = Or(*arr) - if len(arr) == 1: - assert isinstance(actual.match(arr[0]), re.Match) - else: - for value in arr: - assert isinstance(actual.match(value), re.Match) diff --git a/tests/test_range.py b/tests/test_range.py deleted file mode 100644 index b969b61..0000000 --- a/tests/test_range.py +++ /dev/null @@ -1,25 +0,0 @@ -import pytest - -from regexfactory import Range - - -@pytest.mark.patterns -def test_numeric_range(): - start = "0" - end = "9" - assert Range(start, end).regex == "[0-9]" - - -@pytest.mark.patterns -@pytest.mark.parametrize( - "start, stop, expected", - [ - ("0", "9", "[0-9]"), - ("a", "f", "[a-f]"), - ("r", "q", "[r-q]"), - ("A", "Z", "[A-Z]"), - ], -) -def test_range_parameters(start, stop, expected): - actual = Range(start=start, stop=stop) - assert actual.regex == expected diff --git a/tests/test_reference.py b/tests/test_reference.py new file mode 100644 index 0000000..98e0962 --- /dev/null +++ b/tests/test_reference.py @@ -0,0 +1,24 @@ +from hypothesis import HealthCheck, assume, given, settings +from hypothesis import strategies as st +from strategies import gencase, pat_generic +from utils import check_regex + +from regexfactory import RegexPattern, pattern + +pattern._enable_debug = True + + +# checks that ._reference_regex behaves the same as .regex +@given(pat_generic, st.data()) +@settings( + max_examples=10000, suppress_health_check=[HealthCheck.too_slow], deadline=None +) +def test_reference(pat: RegexPattern, data: st.DataObject): + assume(pat._reference_regex is not None) + assert pat._reference_regex is not None + assume(pat.regex != pat._reference_regex) + check_regex( + pat.regex, + pat._reference_regex, + data.draw(gencase(pat.regex, pat._reference_regex)), + ) diff --git a/tests/test_set.py b/tests/test_set.py index d400c8e..8de1455 100644 --- a/tests/test_set.py +++ b/tests/test_set.py @@ -1,16 +1,44 @@ import re -import pytest from hypothesis import given from hypothesis import strategies as st -from strategies import non_escape_char +from utils import check_one -from regexfactory import Set +import regexfactory as r +from regexfactory import IfNotAhead, NotSet, Range, Set, or_ -@pytest.mark.patterns -@given(st.lists(elements=non_escape_char, min_size=1)) -def test_set(chars: list): - actual = Set(*chars) - for value in chars: - assert isinstance(actual.match(value), re.Match) +@given(st.data()) +def test_range(data): + check_one( + Range("a", "e"), + Set("abcde"), + data, + ) + + +@given(st.text(), st.data()) +def test_set(chars, data): + check_one( + Set(*chars), + or_(*(re.escape(x) for x in chars)), + data, + ) + + +@given(st.text(), st.data()) +def test_notset(chars, data): + check_one( + NotSet(*chars), + IfNotAhead(Set(*chars)) + NotSet(), + data, + ) + + +@given(st.data()) +def test_any(data): + check_one( + or_(r.ANY, r.WHITESPACE), + NotSet(), + data, + ) diff --git a/tests/utils.py b/tests/utils.py new file mode 100644 index 0000000..90b790b --- /dev/null +++ b/tests/utils.py @@ -0,0 +1,43 @@ +import re + +from hypothesis import strategies as st + +from regexfactory import RegexPattern, ValidPatternType + + +def check_regex(p1: ValidPatternType, p2: ValidPatternType, v: str): + r1 = p1 if isinstance(p1, str) else RegexPattern.create(p1).regex + r2 = p2 if isinstance(p2, str) else RegexPattern.create(p2).regex + + # checks that r1 and r2 does the same on v + if r1 == r2: + return + + x = re.compile(r1).search(v) + y = re.compile(r2).search(v) + + try: + assert (x is None) == (y is None) + + if x is not None: + assert y is not None + assert x.start() == y.start() + assert x.end() == y.end() + except Exception as e: + raise RuntimeError( + f"string {v!r} results in:\n" + f"r1: {r1!r}\n" + f"=> {x!r}\n" + f"r2: {r2!r}\n" + f"=> {y!r}\n" + ) from e + + +def check_one(v1: ValidPatternType, v2: ValidPatternType, data: st.DataObject): + from strategies import gencase + + r1 = RegexPattern.create(v1).regex + r2 = RegexPattern.create(v2).regex + if r1 == r2: + return + check_regex(r1, r2, data.draw(gencase(r1, r2)))