Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .envrc
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
PATH_add $(nix build --impure --expr "(import <df>).default.python.withPackages (ps: [ps.pip ps.pyparsing ps.hypothesis ps.pytest ps.pytest-cov ps.setuptools ps.ipython ps.pylint])" --no-link --print-out-paths)/bin
path_add PYTHONPATH $(pwd)/
65 changes: 23 additions & 42 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,20 +9,18 @@ Dynamically construct python regex patterns.
Say you want a regex pattern to match the initials of someone's name.

```python
import re
from regexfactory import Amount, Range

from regexfactory import *

pattern = Amount(Range("A", "Z"), 2, 3)
pattern = amount(Range("A", "Z"), 2, 3)

matches = pattern.findall(
"My initials are BDP. Valorie's are VO"
)
matches = pattern.findall("My initials are BDP. Valorie's are VO")

print(pattern.regex)
print(matches)
```

```bash
```
[A-Z]{2,3}
['BDP', 'VO']
```

Expand All @@ -31,36 +29,24 @@ print(matches)
Or how about matching both uppercase and lowercase hex strings in a sentence?

```python
import re
from regexfactory import *

pattern = Optional("#") + Or(
Amount(
Set(
Range("0", "9"),
Range("a", "f")
),
6
),
Amount(
Set(
Range("0", "9"),
Range("A", "F")
),
6
),

pattern = optional("#") + or_(
(Range("0", "9") | Range("a", "f")) * 6,
(Range("0", "9") | Range("A", "F")) * 6,
)

sentence = """
My favorite color is #000000. I also like 5fb8a0. My second favorite color is #FF21FF.
"""

print(pattern.regex)
matches = pattern.findall(sentence)
print(matches)
```

```bash
```
(?:#)?(?:[0-9a-f]{6}|[0-9A-F]{6})
['#000000', '5fb8a0', '#FF21FF']
```

Expand All @@ -71,30 +57,25 @@ Or what if you want to match urls in html content?
```python
from regexfactory import *


protocol = Amount(Range("a", "z"), 1, or_more=True)
host = Amount(Set(WORD, DIGIT, '.'), 1, or_more=True)
port = Optional(IfBehind(":") + Multi(DIGIT))
path = Multi(
RegexPattern('/') + Multi(
NotSet('/', '#', '?', '&', WHITESPACE),
match_zero=True
),
match_zero=True
protocol = amount(Range("a", "z"), 1, or_more=True)
host = amount(WORD | DIGIT | r"\.", 1, or_more=True)
port = optional(":" + multi(DIGIT))
path = multi(
"/" + multi(NotSet("/", "#", "?", "&", WHITESPACE), match_zero=True),
match_zero=True,
)
patt = protocol + RegexPattern("://") + host + port + path

patt = protocol + "://" + host + port + path


sentence = "This is a cool url, https://github.com/GrandMoff100/RegexFactory/ "
print(patt)
print(patt.regex)

print(patt.search(sentence))
```

```bash
[a-z]{1,}://[\w\d.]{1,}(?:\d{1,})?(/([^/#?&\s]{0,})){0,}
<re.Match object; span=(15, 51), match='https://github.com/GrandMoff100/RegexFactory/'>
```
[a-z]+://[\w\d\.]+(?::\d+)?(?:/[^/\#\?\&\s]*)*
<re.Match object; span=(20, 65), match='https://github.com/GrandMoff100/RegexFactory/'>
```

## The Pitch
Expand Down
17 changes: 13 additions & 4 deletions regexfactory/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,21 @@
WHITESPACE,
WORD,
)
from .pattern import ESCAPED_CHARACTERS, RegexPattern, ValidPatternType, escape, join
from .pattern import (
ESCAPED_CHARACTERS,
RegexPattern,
ValidPatternType,
amount,
escape,
join,
multi,
optional,
or_,
)
from .patterns import (
Amount,
Comment,
Concat,
Extension,
Group,
IfAhead,
Expand All @@ -37,12 +48,10 @@
Multi,
NamedGroup,
NamedReference,
NotSet,
NumberedReference,
Optional,
Or,
Range,
Set,
)
from .sets import EMPTY, NEVER, NotSet, Range, Set

__version__ = "1.0.1"
26 changes: 17 additions & 9 deletions regexfactory/chars.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,30 +8,38 @@
"""

from .pattern import RegexPattern
from .sets import CharClass

#: (Dot.) In the default mode, this matches any character except a newline. If the :data:`re.DOTALL` flag has been specified, this matches any character including a newline.
ANY = RegexPattern(r".")
ANY = RegexPattern(r".", _precedence=10)
ANY._desc = "ANY"

#: (Caret.) Matches the start of the string, and in :data:`re.MULTILINE` mode also matches immediately after each newline.
ANCHOR_START = RegexPattern(r"^", _precedence=2)
ANCHOR_START = RegexPattern(r"^", _precedence=0)

#: Matches the end of the string or just before the newline at the end of the string, and in :data:`re.MULTILINE` mode also matches before a newline. The pattern :code:`foo` matches both :code:`foo` and :code:`foobar`, while the regular expression :code:`foo$` matches only :code:`foo`. More interestingly, searching for :code:`foo.$` in :code:`foo1\nfoo2\n` matches :code:`foo2` normally, but :code:`foo1` in :data:`re.MULTILINE` mode; searching for a single :code:`$` in :code:`foo\n` will find two (empty) matches: one just before the newline, and one at the end of the string.
ANCHOR_END = RegexPattern(r"$", _precedence=2)
ANCHOR_END = RegexPattern(r"$", _precedence=0)

#: Matches Unicode whitespace characters (which includes :code:`[ \t\n\r\f\v]`, and also many other characters, for example the non-breaking spaces mandated by typography rules in many languages). If the :data:`re.ASCII` flag is used, only :code:`[ \t\n\r\f\v]` is matched.
WHITESPACE = RegexPattern(r"\s")
WHITESPACE = CharClass(r"\s")
WHITESPACE._desc = "WHITESPACE"

#: Matches any character which is not a whitespace character. This is the opposite of \s. If the :data:`re.ASCII` flag is used this becomes the equivalent of :code:`[^ \t\n\r\f\v]`.
NOTWHITESPACE = RegexPattern(r"\S")
NOTWHITESPACE = CharClass(r"\S")
NOTWHITESPACE._desc = "NOTWHITESPACE"

#: Matches Unicode word characters; this includes most characters that can be part of a word in any language, as well as numbers and the underscore. If the :data:`re.ASCII` flag is used, only :code:`[a-zA-Z0-9_]` is matched.
WORD = RegexPattern(r"\w")
WORD = CharClass(r"\w")
WORD._desc = "WORD"

#: Matches any character which is not a word character. This is the opposite of \w. If the :data:`re.ASCII` flag is used this becomes the equivalent of :code:`[^a-zA-Z0-9_]`. If the :data:`re.LOCALE` flag is used, matches characters which are neither alphanumeric in the current locale nor the underscore.
NOTWORD = RegexPattern(r"\W")
NOTWORD = CharClass(r"\W")
NOTWORD._desc = "NOTWORD"

#: Matches any Unicode decimal digit (that is, any character in Unicode character category [Nd]). This includes :code:`[0-9]`, and also many other digit characters. If the :data:`re.ASCII` flag is used only :code:`[0-9]` is matched.
DIGIT = RegexPattern(r"\d")
DIGIT = CharClass(r"\d")
DIGIT._desc = "DIGIT"

#: Matches any character which is not a decimal digit. This is the opposite of \d. If the :data:`re.ASCII` flag is used this becomes the equivalent of :code:`[^0-9]`.
NOTDIGIT = RegexPattern(r"\D")
NOTDIGIT = CharClass(r"\D")
NOTDIGIT._desc = "NOTDIGIT"
Loading