RegexBuilder

RegexBuilder is a class that helps you build regular expressions. It is based on the SuperExpressive library. The API uses the fluent builder pattern, and is completely immutable. It is built to be discoverable and predictable.

  • Properties and methods describe what they do in plain English.

  • Order matters! Quantifiers are specified before the thing they change, just like in English (e.g. RegexBuilder().exactly(5).digit()).

  • If you make a mistake, you’ll know how to fix it. Edify will guide you towards a fix if your expression is invalid.

  • Subexpressions can be used to create meaningful, reusable components.

.any_char()

.any_char() matches any single character.

from edify import RegexBuilder

# returns re.compile('.')
expr = RegexBuilder().any_char().to_regex()
assert expr.match('a') # Matches
assert expr.match('hello') # Matches

.whitespace_char()

.whitespace_char() matches any whitespace character, including the special whitespace characters: \r\n\t\f\v.

from edify import RegexBuilder

# returns re.compile('\s')
expr = RegexBuilder().whitespace_char().to_regex()
assert expr.match(' ') # Matches
assert expr.match('\n') # Matches
assert expr.match('\t') # Matches
assert expr.match('\r') # Matches
assert expr.match('\f') # Matches
assert expr.match('\v') # Matches
assert not expr.match('a') # Doesn't match
assert not expr.match('hello') # Doesn't match

.non_whitespace_char()

.non_whitespace_char() matches any non-whitespace character. It does not match the special whitespace characters \r\n\t\f\v either.

from edify import RegexBuilder

# returns re.compile('\S')
expr = RegexBuilder().non_whitespace_char().to_regex()
assert expr.match('a') # Matches
assert expr.match('hello') # Matches
assert not expr.match(' ') # Doesn't match
assert not expr.match('\n') # Doesn't match
assert not expr.match('\t') # Doesn't match
assert not expr.match('\r') # Doesn't match
assert not expr.match('\f') # Doesn't match
assert not expr.match('\v') # Doesn't match
assert not expr.match('\u00a0') # Doesn't match
assert not expr.match('\u2000') # Doesn't match

.digit()

.digit() matches any digit from 0-9.

from edify import RegexBuilder

# returns re.compile('\d')
expr = RegexBuilder().digit().to_regex()
assert expr.match('1') # Matches
assert expr.match('9') # Matches
assert not expr.match('a') # Doesn't match
assert not expr.match('\u00a0') # Doesn't match

.non_digit()

.non_digit() matches any non-digit.

from edify import RegexBuilder

# returns re.compile('\D')
expr = RegexBuilder().non_digit().to_regex()
assert expr.match('a') # Matches
assert expr.match('\u00a0') # Matches
assert not expr.match('1') # Doesn't match
assert not expr.match('9') # Doesn't match

.word()

.word() matches any alpha-numeric (a-z, A-Z, 0-9) characters, as well as _.

from edify import RegexBuilder

# returns re.compile('\w')
expr = RegexBuilder().word().to_regex()
assert expr.match('a') # Matches
assert expr.match('1') # Matches
assert expr.match('_') # Matches
assert expr.match('hello') # Matches

.non_word()

.non_word() matches any character that is not alpha-numeric (a-z, A-Z, 0-9) and is not _.

from edify import RegexBuilder

# returns re.compile('\W')
expr = RegexBuilder().non_word().to_regex()
assert not expr.match('a') # Doesn't match
assert not expr.match('1') # Doesn't match
assert expr.match('\u00a0') # Matches
assert expr.match('\u2000') # Matches
assert not expr.match('_') # Doesn't match
assert not expr.match('hello') # Doesn't match

.word_boundary()

.word_boundary() matches (without consuming any characters) immediately between a character matched by .word() and a character not matched by .word() (in either order).

from edify import RegexBuilder

# returns re.compile('\d\b')
expr = RegexBuilder().digit().word_boundary().to_regex()

.non_word_boundary()

.non_word_boundary() matches (without consuming any characters) at the position between two characters matched by .word().

from edify import RegexBuilder

# returns re.compile('\d\B')
expr = RegexBuilder().digit().non_word_boundary().to_regex()

.new_line()

.new_line() matches the newline \n character.

from edify import RegexBuilder

# returns re.compile('\n')
expr = RegexBuilder().new_line().to_regex()
assert expr.match('\n') # Matches
assert not expr.match('a') # Doesn't match
assert not expr.match('hello') # Doesn't match

.carriage_return()

.carriage_return() matches the carriage return \r character.

from edify import RegexBuilder

# returns re.compile('\r')
expr = RegexBuilder().carriage_return().to_regex()
assert expr.match('\r') # Matches
assert not expr.match('a') # Doesn't match
assert not expr.match('hello') # Doesn't match

.tab()

.tab() matches the tab \t character.

from edify import RegexBuilder

# returns re.compile('\t')
expr = RegexBuilder().tab().to_regex()
assert expr.match('\t') # Matches
assert not expr.match('a') # Doesn't match
assert not expr.match('hello') # Doesn't match

.null_byte()

.null_byte() matches the null byte \0 character.

from edify import RegexBuilder

# returns re.compile('\0')
expr = RegexBuilder().null_byte().to_regex()
assert expr.match('\0') # Matches
assert not expr.match('a') # Doesn't match
assert not expr.match('hello') # Doesn't match

.any_of()

.any_of() matches a choice between specified elements. Needs to be finalised with .end().

from edify import RegexBuilder

# returns re.compile('(?:hello|[a-f0-9])')
expr = (
    RegexBuilder()
    .any_of()
        .range('a', 'f')
        .range('0', '9')
        .string('hello')
    .end()
    .to_regex()
)
assert expr.match('a') # Matches
assert expr.match('f') # Matches
assert expr.match('9') # Matches
assert expr.match('hello') # Matches
assert not expr.match('g') # Doesn't match
assert not expr.match('good world') # Doesn't match

.capture()

.capture() creates a capture group for the following elements. Needs to be finalised with .end(). Can be later referenced with .back_reference(index).

from edify import RegexBuilder

# returns re.compile('([a-f][0-9]hello)')
expr = (
    RegexBuilder()
    .capture()
        .range('a', 'f')
        .range('0', '9')
        .string('hello')
    .end()
    .to_regex()
)
assert expr.match('a9hello') # Matches
assert expr.match('f0hello') # Matches
assert not expr.match('g9hello') # Doesn't match

.named_capture(name)

.named_capture() creates a named capture group for the following elements. Needs to be finalised with .end(). Can be later referenced with .named_back_reference(name) or .back_reference(index).

from edify import RegexBuilder

# returns re.compile('(?P<interestingStuff>[a-f][0-9]hello)')
expr = (
    RegexBuilder()
    .named_capture('interestingStuff')
        .range('a', 'f')
        .range('0', '9')
        .string('hello')
    .end()
    .to_regex()
)
assert expr.match('a9hello') # Matches
assert expr.match('f0hello') # Matches
assert not expr.match('g9hello') # Doesn't match

.named_back_reference(name)

.named_back_reference() matches exactly what was previously matched by a .named_capture(name).

Warning

Python's built-in re module does not support the \k<name> named back reference syntax that this method produces. If you call the to_regex() method on an expression containing a named back reference, it will raise an exception. For that reason, to_regex_string() is provided instead. It returns a string that can be used to create a regular expression. You can try using that string directly with another library like regex.

from edify import RegexBuilder

# returns /(?<interestingStuff>[a-f][0-9]hello)something else\k<interestingStuff>/
expr = (
    RegexBuilder()
    .named_capture('interestingStuff')
        .range('a', 'f')
        .range('0', '9')
        .string('hello')
    .end()
    .string('something else')
    .named_back_reference('interestingStuff')
    .to_regex_string()
)

.back_reference(index)

.back_reference() matches exactly what was previously matched by a .capture() or .named_capture(name) using a positional index. Note regex indexes start at 1, so the first capture group has index 1.

from edify import RegexBuilder

# returns re.compile('([a-f][0-9]hello)\\1')
expr = (
    RegexBuilder()
    .capture()
        .range('a', 'f')
        .range('0', '9')
        .string('hello')
    .end()
    .back_reference(1)
    .to_regex()
)
assert expr.match('a9helloa9hello') # Matches
assert not expr.match('a9helloa9hell') # Doesn't match

.group()

.group() creates a non-capturing group for the following elements. Needs to be finalised with .end().

from edify import RegexBuilder

# returns re.compile('(?:[a-f][0-9]hello)?')
expr = (
    RegexBuilder()
    .optional().group()
        .range('a', 'f')
        .range('0', '9')
        .string('hello')
    .end()
    .to_regex()
)
assert expr.match('a9hello') # Matches
assert expr.match('') # Matches
assert expr.match('g9hello') # Matches (the optional group matches the empty string)

.end()

.end() signifies the end of a RegexBuilder grouping, such as .capture(), .group() or .any_of() element.

from edify import RegexBuilder

# returns re.compile('((?:hello|[a-f0-9]))')
expr = (
    RegexBuilder()
    .capture()
        .any_of()
            .range('a', 'f')
            .range('0', '9')
            .string('hello')
        .end()
    .end()
    .to_regex()
)

.assert_ahead()

.assert_ahead() asserts that the following elements are found without consuming them. Needs to be finalised with .end().

from edify import RegexBuilder

# returns re.compile('(?=[a-f])[a-z]')
expr = (
    RegexBuilder()
    .assert_ahead()
        .range('a', 'f')
    .end()
    .range('a', 'z')
    .to_regex()
)
assert expr.match('a') # Matches
assert expr.match('f') # Matches
assert not expr.match('g') # Doesn't match

.assert_not_ahead()

.assert_not_ahead() asserts that the following elements are not found without consuming them. Needs to be finalised with .end().

from edify import RegexBuilder

# returns re.compile('(?![a-f])[g-z]')
expr = (
    RegexBuilder()
    .assert_not_ahead()
        .range('a', 'f')
    .end()
    .range('g', 'z')
    .to_regex()
)
assert expr.match('g') # Matches
assert expr.match('z') # Matches
assert not expr.match('a') # Doesn't match

.assert_behind()

.assert_behind() asserts that the elements contained within are found immediately before this point in the string. Needs to be finalised with .end().

from edify import RegexBuilder

# returns re.compile('(?<=hello )world')
expr = (
    RegexBuilder()
    .assert_behind()
        .string('hello ')
    .end()
    .string('world')
    .to_regex()
)

.assert_not_behind()

.assert_not_behind() asserts that the elements contained within are not found immediately before this point in the string. Needs to be finalised with .end().

from edify import RegexBuilder

# returns re.compile('(?<!hello )world')
expr = (
    RegexBuilder()
    .assert_not_behind()
        .string('hello ')
    .end()
    .string('world')
    .to_regex()
)

.optional()

.optional() asserts that the following element may or may not be matched.

from edify import RegexBuilder

# returns re.compile('\d?')
expr = (
    RegexBuilder()
    .optional()
        .digit()
    .to_regex()
)

.zero_or_more()

.zero_or_more() asserts that the following element may not be matched, or may be matched multiple times.

from edify import RegexBuilder

# returns re.compile('\d*')
expr = (
    RegexBuilder()
    .zero_or_more()
        .digit()
    .to_regex()
)

.zero_or_more_lazy()

.zero_or_more_lazy() asserts that the following element may not be matched, or may be matched multiple times, but as few times as possible.

from edify import RegexBuilder

# returns re.compile('\d*?')
expr = (
    RegexBuilder()
    .zero_or_more_lazy()
        .digit()
    .to_regex()
)

.one_or_more()

.one_or_more() asserts that the following element may be matched one or more times.

from edify import RegexBuilder

# returns re.compile('\d+')
expr = (
    RegexBuilder()
    .one_or_more()
        .digit()
    .to_regex()
)

.one_or_more_lazy()

.one_or_more_lazy() asserts that the following element may be matched one or more times, but as few times as possible.

from edify import RegexBuilder

# returns re.compile('\d+?')
expr = (
    RegexBuilder()
    .one_or_more_lazy()
        .digit()
    .to_regex()
)

.exactly(n)

.exactly(n) asserts that the following element will be matched exactly n times.

from edify import RegexBuilder

# returns re.compile('\d{3}')
expr = (
    RegexBuilder()
    .exactly(3)
        .digit()
    .to_regex()
)

.at_least(n)

.at_least(n) asserts that the following element will be matched at least n times.

from edify import RegexBuilder

# returns re.compile('\d{3,}')
expr = (
    RegexBuilder()
    .at_least(3)
        .digit()
    .to_regex()
)

.between(n, m)

.between(n, m) asserts that the following element will be matched between n and m times.

from edify import RegexBuilder

# returns re.compile('\d{3,5}')
expr = (
    RegexBuilder()
    .between(3, 5)
        .digit()
    .to_regex()
)

.between_lazy(n, m)

.between_lazy(n, m) asserts that the following element will be matched between n and m times, but as few times as possible.

from edify import RegexBuilder

# returns re.compile('\d{3,5}?')
expr = (
    RegexBuilder()
    .between_lazy(3, 5)
        .digit()
    .to_regex()
)

.start_of_input()

.start_of_input() asserts the start of input, or the start of a line when the multiline (M) flag is used.

from edify import RegexBuilder

# returns re.compile('^hello')
expr = (
    RegexBuilder()
    .start_of_input()
    .string('hello')
    .to_regex()
)

.end_of_input()

.end_of_input() asserts the end of input, or the end of a line when the multiline (M) flag is used.

from edify import RegexBuilder

# returns re.compile('hello$')
expr = (
    RegexBuilder()
    .string('hello')
    .end_of_input()
    .to_regex()
)

.any_of_chars(chars)

.any_of_chars(chars) matches any of the characters in the provided string chars.

from edify import RegexBuilder

# returns re.compile('[abc]')
expr = (
    RegexBuilder()
    .any_of_chars('abc')
    .to_regex()
)

.anything_but_chars(chars)

.anything_but_chars(chars) matches any character except those in the provided string chars.

from edify import RegexBuilder

# returns re.compile('[^abc]')
expr = (
    RegexBuilder()
    .anything_but_chars('abc')
    .to_regex()
)

.anything_but_string(string)

.anything_but_string(string) matches any string of the same length as string in which each character differs from the character at the same position in string.

from edify import RegexBuilder

# returns re.compile('(?:[^a][^e][^i][^o][^u])')
expr = (
    RegexBuilder()
    .anything_but_string('aeiou')
    .to_regex()
)

.anything_but_range(start, end)

.anything_but_range(start, end) matches any character except those that would be captured by the .range(start, end) specified by start and end.

from edify import RegexBuilder

# returns re.compile('[^a-z]')
expr = (
    RegexBuilder()
    .anything_but_range('a', 'z')
    .to_regex()
)

.string(s)

.string(s) matches the exact string s.

from edify import RegexBuilder

# returns re.compile('hello')
expr = (
    RegexBuilder()
    .string('hello')
    .to_regex()
)

.char(c)

.char(c) matches the exact character c.

from edify import RegexBuilder

# returns re.compile('h')
expr = (
    RegexBuilder()
    .char('h')
    .to_regex()
)

.range(start, end)

.range(start, end) matches any character that falls between start and end. Ordering is defined by a character's ASCII or Unicode value.

from edify import RegexBuilder

# returns re.compile('[a-z]')
expr = (
    RegexBuilder()
    .range('a', 'z')
    .to_regex()
)

.subexpression(expr, opts)

.subexpression() matches another RegexBuilder instance inline. Can be used to create libraries, or to modularise your code. By default, flags and start/end of input markers are ignored, but can be explicitly turned on in the opts dictionary.

opts is an optional dictionary that can be used to control how the subexpression is treated. It has the following properties:

namespace

A string namespace to use on all named capture groups in the subexpression, to avoid naming collisions with your own named groups. Defaults to ' '.

ignore_flags

If set to True, any flags this subexpression specifies should be disregarded. Defaults to True.

ignore_start_and_end

If set to True, any start_of_input/end_of_input assertions this subexpression specifies should be disregarded. Defaults to True.

A sample opts dictionary might look like this:

opts = {
    'namespace': 'my_namespace',
    'ignore_flags': False,
    'ignore_start_and_end': False
}

You can use the .subexpression() method like this:

from edify import RegexBuilder

# returns re.compile('[a-z]+.{3,}\d{5}')
five_digits = RegexBuilder().exactly(5).digit()
expr = (
    RegexBuilder()
    .one_or_more().range('a', 'z')
    .at_least(3).any_char()
    .subexpression(five_digits)
    .to_regex()
)

.to_regex_string()

.to_regex_string() returns a string representation of the regular expression that this RegexBuilder instance represents.

from edify import RegexBuilder

# returns '/^(?:0x)?([A-Fa-f0-9]{4})$/IM'
expr = (
    RegexBuilder()
    .ignore_case()
    .multiline()
    .start_of_input()
    .optional().string('0x')
    .capture()
        .exactly(4).any_of()
            .range('A', 'F')
            .range('a', 'f')
            .range('0', '9')
        .end()
    .end()
    .end_of_input()
    .to_regex_string()
)

.to_regex()

.to_regex() returns a compiled regular expression object that this RegexBuilder instance represents. The compiled regular expression is an re.Pattern object (as returned by re.compile), so any of its methods like .search(), .match(), .findall(), etc. can be used on it.

from edify import RegexBuilder

# returns re.compile('^(?:0x)?([A-Fa-f0-9]{4})$', re.MULTILINE | re.IGNORECASE)
expr = (
    RegexBuilder()
    .ignore_case()
    .multiline()
    .start_of_input()
    .optional().string('0x')
    .capture()
        .exactly(4).any_of()
            .range('A', 'F')
            .range('a', 'f')
            .range('0', '9')
        .end()
    .end()
    .end_of_input()
    .to_regex()
)

# returns re.Match object
expr.match('0x1234')