From 21ad84559b9daf184cc037b89df38969adf36eb1 Mon Sep 17 00:00:00 2001 From: Jonathan Goble Date: Sun, 27 Dec 2015 02:32:16 -0500 Subject: [PATCH] initial commit --- .gitattributes | 1 + .gitignore | 12 + .travis.yml | 35 ++ LICENSE | 21 ++ README.md | 54 +++ setup.cfg | 2 + setup.py | 39 +++ src/luapatt.py | 562 ++++++++++++++++++++++++++++++++ tests/helpers.py | 36 ++ tests/test_custom.py | 95 ++++++ tests/test_lua1_basics.py | 164 ++++++++++ tests/test_lua2_captgsub.py | 199 +++++++++++ tests/test_lua3_gmfronterror.py | 167 ++++++++++ tests/test_lua4_sets.py | 78 +++++ 14 files changed, 1465 insertions(+) create mode 100644 .gitattributes create mode 100644 .gitignore create mode 100644 .travis.yml create mode 100644 LICENSE create mode 100644 README.md create mode 100644 setup.cfg create mode 100644 setup.py create mode 100644 src/luapatt.py create mode 100644 tests/helpers.py create mode 100644 tests/test_custom.py create mode 100644 tests/test_lua1_basics.py create mode 100644 tests/test_lua2_captgsub.py create mode 100644 tests/test_lua3_gmfronterror.py create mode 100644 tests/test_lua4_sets.py diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..fcadb2c --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +* text eol=lf diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0144759 --- /dev/null +++ b/.gitignore @@ -0,0 +1,12 @@ +# Python bytecode +__pycache__/ + +# py.test junk +.cache/ +.coverage +htmlcov/ + +# setuptools artifacts +build/ +dist/ +*.egg-info/ diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..305b22f --- /dev/null +++ b/.travis.yml @@ -0,0 +1,35 @@ +branches: + only: + - develop + - master + +language: python + +python: + - "3.5" + - "3.5-dev" # 3.5 development branch + - "nightly" # currently points to 3.6-dev + +matrix: + allow_failures: + - python: "3.5-dev" + - python: "nightly" + +install: + - pip install flake8 + +script: + - if [[ $TRAVIS_PYTHON_VERSION = 3.5 ]]; 
then flake8 luapatt; fi + - if [[ $TRAVIS_PYTHON_VERSION = nightly ]]; then python -m pytest; fi + - if [[ $TRAVIS_PYTHON_VERSION = 3.5* ]]; then python3.5 -m pytest; fi + - if [[ $TRAVIS_PYTHON_VERSION = 3.5 ]]; then python3.4 -m pytest; fi + - if [[ $TRAVIS_PYTHON_VERSION = 3.5 ]]; then python3.3 -m pytest; fi + +deploy: + provider: pypi + user: jcgoble3 + on: + branch: master + tags: true + python: "3.5" + distributions: sdist bdist_wheel diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..ade29a6 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2015 Jonathan Goble + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..6e57fdf --- /dev/null +++ b/README.md @@ -0,0 +1,54 @@ +# luapatt + +A Python 3.3+ implementation of the +[Lua language's](http://www.lua.org/home.html) pattern matching functions. 
+Lua's pattern matching is simpler than regular expressions and lacks several +features that regexes have, such as `|` for alternation, but also contains some +features difficult or impossible to duplicate in most regex flavors, such as +the ability to easily match a balanced pair of parentheses (or any two other +characters). + +## Installation + +`pip install -i https://testpypi.python.org/pypi luapatt` + +Upload to the regular PyPI will come "soon". + +## Documentation + +For documentation on how pattern matching works, please read the +[Lua reference manual](http://www.lua.org/manual/5.3/manual.html#6.4.1). +This library contains the following differences from stock Lua: + +* `%c`, `%g`, `%p`, and their negated counterparts are not available; +attempting to use them will raise the built-in `NotImplementedError`. +* Other character classes that rely on the meaning of a character call Python's +`str.is*` family of methods, and so use the Unicode definition of that meaning. +* String positions are zero-based instead of one-based, reflecting the fact +that Python is generally zero-based (as opposed to Lua, which has one-based +indexes). This affects position captures and the indexes returned as the first +two results from `find()`. +* Function return values are combined into a tuple, as is standard with Python. +However, singleton tuples are not returned; the single value is returned +directly instead. +* `gsub()` does *not* return the number of substitutions by default, instead +returning only the new string. To get the count, pass the named argument +`count=True` to the call (which will result in a 2-tuple of the new string and +the count). +* An extra function, `set_escape_char()`, is provided to change the escape +character. It takes one argument: the new escape character, which must be a +`str` object of length 1. The escape character cannot be set to any of the +other special characters. 
While it is possible to set it to a letter or number, +this is not recommended as it may interfere with other aspects of pattern +matching, and doing so may be disallowed in the future. + * **NOTE:** Because `set_escape_char` modifies global state, it is **not** +thread-safe. +* Unlike Lua, which has no notion of a Unicode string and assumes all +characters are one byte in length, this library operates on full Unicode +strings (i.e. `str` objects). If you pass bytes objects to this library, the +behavior is undefined. + +## Licensing + +As with Lua itself, this library is released under the MIT License. See the +[`LICENSE` file](./LICENSE) for more details. diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..b88034e --- /dev/null +++ b/setup.cfg @@ -0,0 +1,2 @@ +[metadata] +description-file = README.md diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..9814e9b --- /dev/null +++ b/setup.py @@ -0,0 +1,39 @@ +from setuptools import setup, find_packages + +from codecs import open +from os import path + +here = path.abspath(path.dirname(__file__)) + +with open(path.join(here, 'README.md'), encoding='utf-8') as f: + long_description = f.read() + +setup( + name='luapatt', + version='0.9.0b1', + description='Python implementation of Lua-style pattern matching', + long_description=long_description, + url='https://github.com/jcgoble3/luapatt', + author='Jonathan Goble', + author_email='jcgoble3@gmail.com', + license='MIT', + classifiers=[ + 'Development Status :: 4 - Beta', + # 'Development Status :: 5 - Production/Stable', + 'Intended Audience :: Developers', + 'License :: OSI Approved :: MIT License', + 'Operating System :: OS Independent', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3 :: Only', + 'Programming Language :: Python :: 3.3', + 'Programming Language :: Python :: 3.4', + 'Programming Language :: Python :: 3.5', + 'Topic :: Software Development :: Libraries :: Python Modules', + 'Topic :: Text 
Processing :: General' + ], + keywords='Lua pattern matching regex regular expressions', + package_dir={'': 'src'}, + py_modules=["luapatt"], + install_requires=[], # no dependencies + extras_require={'test': ['pytest']} +) diff --git a/src/luapatt.py b/src/luapatt.py new file mode 100644 index 0000000..2d3f3db --- /dev/null +++ b/src/luapatt.py @@ -0,0 +1,562 @@ +# Copyright 2015 Jonathan Goble +# +# Permission is hereby granted, free of charge, to any person obtaining +# a copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. 
from array import array

# Map "exclusive upper bound on source length" -> the smallest signed array
# typecode whose items can hold every index below that bound.  Signed codes
# are required because capture ends also store the negative sentinels below.
_ARRAYTYPECODES = {}
for code in 'bhilq':
    try:
        size = 1 << ((array(code).itemsize << 3) - 1)
    except ValueError:  # 'q' might not be available
        pass
    else:
        # Several codes may share an item size; keep the first one seen.
        _ARRAYTYPECODES.setdefault(size, code)
_ARRAYSIZES = sorted(_ARRAYTYPECODES)
del size, code  # keep the loop temporaries out of the module namespace

MAXCAPTURES = 100  # maximum number of captures in one pattern
MAXRECURSION = 200  # maximum nesting depth of _PatternMatcher.match()
ESCAPE = '%'  # current escape character; see set_escape_char()
SPECIALS = '^$*+?.([-'  # characters that disable the plain-find shortcut
# Sentinels stored in _MatchState.captureends:
UNFINISHEDCAPTURE = -1  # capture opened but not yet closed
POSITIONCAPTURE = -2  # "()" capture; its value is the start index
PLACEHOLDER = -3  # slot that has never been used


class PatternError(Exception):
    '''Base class for all pattern-related errors in this module.'''


class PatternSyntaxError(PatternError):
    '''All syntax errors; argument is description of problem.'''


class PatternLongSourceError(PatternError):
    '''The source string is too long for any available array typecode.'''

    def __str__(self):
        return 'source string too long'


class PatternTooComplex(PatternError):
    '''Base class for when the pattern exceeds complexity limits.'''


class PatternStackOverflow(PatternTooComplex):
    '''Matching nested deeper than MAXRECURSION.'''

    def __str__(self):
        return 'pattern too complex, exceeded recursion limit'


class PatternTooManyCaptures(PatternTooComplex):
    '''The pattern opened more than MAXCAPTURES captures.'''

    def __str__(self):
        return 'too many captures'


class _MatchState:
    '''Mutable bookkeeping for one matching run over a source string.

    Holds the recursion budget, the start index of the current attempt and
    two parallel arrays with the start and end index of every capture.
    '''

    def __init__(self, source, pattern, noanchor=False):
        '''Create the state for `source`.

        `pattern` and `noanchor` are accepted but not used here.  Raises
        PatternLongSourceError when len(source) does not fit in any
        available signed array item type.
        '''
        self.matchdepth = MAXRECURSION
        self.source = source
        self.srcstart = 0
        self.capturenum = 0
        sourcelen = len(source)
        # Pick the narrowest item type able to store every source index.
        for size in _ARRAYSIZES:
            if sourcelen < size:
                typecode = _ARRAYTYPECODES[size]
                break
        else:
            raise PatternLongSourceError
        self.capturestarts = array(typecode, [PLACEHOLDER]) * MAXCAPTURES
        self.captureends = array(typecode, [PLACEHOLDER]) * MAXCAPTURES

    def reset(self, init):
        '''Prepare for a fresh match attempt starting at index `init`.'''
        self.capturenum = 0
        self.srcstart = init
        self.matchdepth = MAXRECURSION

    def getsinglecapture(self, num):
        '''Return capture `num` (zero-based).

        A position capture yields its integer start index; any other
        capture yields the captured substring.
        '''
        start = self.capturestarts[num]
        end = self.captureends[num]
        if end == POSITIONCAPTURE:
            return start
        return self.source[start:end]

    def getcaptures(self, sp):
        '''Return the list of captures for a match that ended at `sp`.

        When the pattern has no captures the whole match is returned as
        the single element.
        '''
        if self.capturenum == 0:
            return [self.source[self.srcstart:sp]]
        return [self.getsinglecapture(n) for n in range(self.capturenum)]


class _PatternMatcher:
    '''Backtracking matcher for one (source, pattern) pair.

    In the method signatures below `sp` is an index into the source and
    `pp`/`ep` are indexes into the pattern.  Matching methods return the
    source index just past the match, or None when there is no match.
    '''

    def __init__(self, source, pattern, noanchor=False):
        '''Store the inputs; a leading '^' anchors unless `noanchor`.'''
        self.source = source
        self.srclen = len(source)
        self.pattern = pattern
        self.pattlen = len(pattern)
        self.anchor = False if noanchor else (self.pattlen > 0 and
                                              pattern[0] == '^')

    @property
    def nospecials(self):
        '''True when the pattern holds no special or escape character.'''
        return not any(c in self.pattern for c in SPECIALS + ESCAPE)

    def find_aux(self, type, init=0, plain=False):
        '''Shared search driver behind find/match/gmatch/gsub.

        `type` is one of 'find', 'match', 'gmatch' or 'gsub' and selects
        the shape of the returned value.  The search starts at index
        `init` (negative values are clamped to 0).  Returns None when
        nothing matches.
        '''
        if init < 0:
            init = 0
        if init > self.srclen:  # start after source's end?
            return None  # no chance of finding anything
        if type == 'find' and (plain or self.nospecials):
            # No pattern syntax involved: use the built-in str.find().
            start = self.source.find(self.pattern, init)
            if start > -1:
                return (start, start + self.pattlen)
            return None
        self.state = _MatchState(self.source, self.pattern)
        pp = 1 if self.anchor else 0  # skip the leading '^'
        while True:
            self.state.reset(init)
            sp = self.match(init, pp)
            if sp is not None:
                if type == 'find':
                    ret = [init, sp]
                    if self.state.capturenum != 0:
                        ret.extend(self.state.getcaptures(sp))
                    return tuple(ret)
                elif type == 'match':
                    result = tuple(self.state.getcaptures(sp))
                    # Singleton tuples are unwrapped.
                    return result[0] if len(result) == 1 else result
                elif type == 'gmatch':
                    captures = tuple(self.state.getcaptures(sp))
                    if len(captures) == 1:
                        return (init, sp), captures[0]
                    return (init, sp), captures
                elif type == 'gsub':
                    return (init, sp), tuple(self.state.getcaptures(sp))
            # An anchored pattern gets exactly one attempt; otherwise try
            # every later start index up to and including len(source).
            if self.anchor or init >= self.srclen:
                return None
            init += 1

    def _subst_str(self, captures, repl, matchstart, matchend):
        '''Expand the replacement string `repl` for one match.

        ESCAPE followed by ESCAPE gives a literal escape character,
        ESCAPE+'0' the whole match and ESCAPE+'1'..'9' a capture.  Raises
        PatternSyntaxError for a trailing bare escape, any other escaped
        character or an invalid capture index.
        '''
        char = 0
        rlen = len(repl)
        accum = []
        while char < rlen:
            c = repl[char]
            if c != ESCAPE:
                accum.append(c)
            else:
                char += 1
                if char == rlen:
                    raise PatternSyntaxError(
                        "replacement string ends with bare '{}'".format(ESCAPE)
                    )
                c = repl[char]
                if c == ESCAPE:
                    accum.append(ESCAPE)
                elif c in '123456789':
                    if c == '1':
                        # Always valid: without explicit captures,
                        # captures[0] is the whole match.
                        index = 0
                    else:
                        index = self.checkcapture(int(c))
                    accum.append(str(captures[index]))
                elif c == '0':
                    accum.append(str(self.source[matchstart:matchend]))
                else:
                    raise PatternSyntaxError(
                        "invalid '{}{}' in replacement "
                        "string".format(ESCAPE, c)
                    )
            char += 1
        return ''.join(accum)

    def subst(self, captures, repl, matchstart, matchend):
        '''Return the replacement text for one match.

        `repl` may be a callable (called with the captures), a dict
        (looked up with the first capture) or anything else, which is
        converted to str and expanded by _subst_str().  A None or False
        result keeps the original matched text.
        '''
        if callable(repl):
            value = repl(*captures)
        elif isinstance(repl, dict):  # dict subclasses are lookups too
            value = repl.get(captures[0])
        else:
            value = self._subst_str(captures, str(repl), matchstart, matchend)
        if value is None or value is False:
            value = self.source[matchstart:matchend]
        return str(value)

    def match(self, sp, pp):
        '''Match pattern[pp:] against the source starting at `sp`.

        Returns the end index of the match or None.  Raises
        PatternStackOverflow when nested deeper than MAXRECURSION and
        PatternSyntaxError for malformed patterns.
        '''
        if self.state.matchdepth == 0:
            raise PatternStackOverflow
        self.state.matchdepth -= 1
        while pp < self.pattlen:
            pc = self.pattern[pp]
            try:
                pc1 = self.pattern[pp + 1]
            except IndexError:
                pc1 = None
            if pc == '(':
                if pc1 == ')':
                    sp = self.startcapture(sp, pp + 2, POSITIONCAPTURE)
                else:
                    sp = self.startcapture(sp, pp + 1, UNFINISHEDCAPTURE)
                break
            elif pc == ')':
                sp = self.endcapture(sp, pp + 1)
                break
            elif pc == '$' and pp + 1 == self.pattlen:
                # '$' is an anchor only as the pattern's last character.
                if sp != self.srclen:
                    sp = None
                break
            elif pc == ESCAPE:
                if pc1 is None:
                    raise PatternSyntaxError(
                        "pattern ends with bare '{}'".format(ESCAPE)
                    )
                elif pc1 == 'b':
                    sp = self.matchbalance(sp, pp + 2)
                    if sp is None:
                        break
                    pp += 4  # escape, 'b' and the two delimiters
                    continue
                elif pc1 == 'f':
                    pp += 2
                    if pp >= self.pattlen or self.pattern[pp] != '[':
                        raise PatternSyntaxError(
                            "missing '[' after '{}f'".format(ESCAPE)
                        )
                    ep = self.classend(pp)
                    set = self.pattern[pp + 1:ep - 1]
                    # Both ends of the source behave as if '\0' were there.
                    prev = '\0' if sp == 0 else self.source[sp - 1]
                    next = '\0' if sp >= self.srclen else self.source[sp]
                    if (not self.matchbracketclass(prev, set) and
                            self.matchbracketclass(next, set)):
                        pp = ep
                        continue
                    sp = None
                    break
                elif pc1 in '0123456789':
                    sp = self.matchcapture(sp, int(pc1))
                    if sp is None:
                        break
                    pp += 2
                    continue
            # This point can only be reached if all conditions above test
            # false. Any true condition above is guaranteed to result in
            # either a break, a continue, or raising an error.
            ep = self.classend(pp)  # ep points to the optional quantifier
            try:
                qc = self.pattern[ep]
            except IndexError:
                qc = None
            if not self.singlematch(sp, pp, ep):
                if qc and qc in '*?-':  # allow zero matches?
                    pp = ep + 1
                    continue
                sp = None
                break
            else:  # matched once
                if qc == '?':
                    result = self.match(sp + 1, ep + 1)
                    if result is None:
                        # Retry the rest with zero occurrences.
                        pp = ep + 1
                        continue
                    else:
                        sp = result
                        break
                elif qc == '+':
                    sp = self.maxexpand(sp + 1, pp, ep)
                    break
                elif qc == '*':
                    sp = self.maxexpand(sp, pp, ep)
                    break
                elif qc == '-':
                    sp = self.minexpand(sp, pp, ep)
                    break
                else:  # no quantifier
                    sp += 1
                    pp = ep
                    continue
        self.state.matchdepth += 1
        return sp

    def matchcapture(self, sp, num):
        '''Match a back-reference to capture `num` (one-based) at `sp`.'''
        index = self.checkcapture(num)
        cs = self.state.capturestarts[index]
        ce = self.state.captureends[index]
        cl = ce - cs
        if (cl <= self.srclen - sp and
                self.source[sp:sp + cl] ==
                self.state.getsinglecapture(index)):
            return sp + cl
        return None

    def startcapture(self, sp, pp, what):
        '''Open a capture at `sp` and match the rest of the pattern.

        `what` is the initial end marker (UNFINISHEDCAPTURE or
        POSITIONCAPTURE).  Raises PatternTooManyCaptures or, when the
        capture is never closed, PatternSyntaxError.
        '''
        cnum = self.state.capturenum
        if cnum >= MAXCAPTURES:
            raise PatternTooManyCaptures
        self.state.capturestarts[cnum] = sp
        self.state.captureends[cnum] = what
        self.state.capturenum += 1
        result = self.match(sp, pp)
        if result is None:
            self.state.capturenum -= 1  # undo capture
        elif self.state.captureends[cnum] == UNFINISHEDCAPTURE:
            raise PatternSyntaxError('unfinished capture')
        return result

    def endcapture(self, sp, pp):
        '''Close the innermost open capture at `sp` and match the rest.'''
        index = self.capturetoclose()
        self.state.captureends[index] = sp
        result = self.match(sp, pp)
        if result is None:
            self.state.captureends[index] = UNFINISHEDCAPTURE  # undo
        return result

    def minexpand(self, sp, pp, ep):
        '''Handle the lazy '-' quantifier: take as few items as possible.'''
        while True:
            result = self.match(sp, ep + 1)
            if result is not None:
                return result
            elif self.singlematch(sp, pp, ep):
                sp += 1  # consume one more item and try again
            else:
                return None

    def maxexpand(self, sp, pp, ep):
        '''Handle greedy '*' and '+': take the most, then back off.'''
        count = 0
        while self.singlematch(sp + count, pp, ep):
            count += 1
        while count >= 0:
            result = self.match(sp + count, ep + 1)
            if result is not None:
                return result
            count -= 1
        return None

    def matchbalance(self, sp, pp):
        '''Match a balanced run; pattern[pp] and pattern[pp + 1] delimit.

        Raises PatternSyntaxError when either delimiter is missing.
        '''
        if pp > self.pattlen - 2:
            raise PatternSyntaxError(
                "missing arguments to '{}b'".format(ESCAPE)
            )
        b = self.pattern[pp]
        if sp == self.srclen or self.source[sp] != b:
            return None
        e = self.pattern[pp + 1]
        level = 1
        sp += 1
        while sp < self.srclen:
            sc = self.source[sp]
            # The closer is tested first, so identical delimiters close.
            if sc == e:
                level -= 1
                if level == 0:
                    return sp + 1
            elif sc == b:
                level += 1
            sp += 1
        return None  # ran off the end of the source

    def singlematch(self, sp, pp, ep):
        '''Tell whether source[sp] matches the single item pattern[pp:ep].'''
        if sp >= self.srclen:
            return False
        sc = self.source[sp]
        pc = self.pattern[pp]
        if pc == '.':
            return True
        elif pc == ESCAPE:
            return self.matchclass(sc, self.pattern[pp + 1])
        elif pc == '[':
            return self.matchbracketclass(sc, self.pattern[pp + 1:ep - 1])
        return sc == pc

    def matchbracketclass(self, sc, set):
        '''Tell whether `sc` belongs to `set`, a "[...]" body sans brackets.

        `set` is never empty: classend() rejects "[]" and "[^]".
        '''
        if set[0] == '^':
            signal = False
            pos = 1
        else:
            signal = True
            pos = 0
        sl = len(set)
        while pos < sl:
            pc = set[pos]
            try:
                pc1 = set[pos + 1]
            except IndexError:
                pc1 = None
            if pc == ESCAPE:
                pos += 1
                if self.matchclass(sc, pc1):
                    return signal
            elif pc1 == '-' and pos + 2 < sl:
                pos += 2  # a range such as a-z
                if pc <= sc <= set[pos]:
                    return signal
            elif pc == sc:
                return signal
            pos += 1
        return not signal

    def matchclass(self, sc, pc):
        '''Tell whether `sc` belongs to the class named by letter `pc`.

        Upper-case class letters negate the class; a character that names
        no class is compared literally.  Raises NotImplementedError for
        the c, g and p classes.
        '''
        pcl = pc.lower()
        if pcl == 'a':
            matched = sc.isalpha()
        elif pcl == 'd':
            matched = sc.isdigit()
        elif pcl == 'l':
            matched = sc.islower()
        elif pcl == 's':
            matched = sc.isspace()
        elif pcl == 'u':
            matched = sc.isupper()
        elif pcl == 'w':
            matched = sc.isalpha() or sc.isdigit()
        elif pcl == 'x':
            matched = sc.isdigit() or sc in 'abcdefABCDEF'
        elif pcl == 'z':
            matched = sc == '\0'
        elif pcl in 'cgp':
            raise NotImplementedError('{0}c, {0}g, and {0}p are not '
                                      'available'.format(ESCAPE))
        else:
            return sc == pc
        return matched if pc.islower() else not matched

    def checkcapture(self, n):
        '''Validate one-based capture number `n`; return its 0-based index.

        Raises PatternSyntaxError when the capture does not exist or is
        still open.
        '''
        n -= 1
        if (n < 0 or n >= self.state.capturenum or
                self.state.captureends[n] == UNFINISHEDCAPTURE):
            raise PatternSyntaxError(
                'invalid capture index {}{}'.format(ESCAPE, n + 1)
            )
        return n

    def capturetoclose(self):
        '''Return the index of the innermost capture that is still open.'''
        for index in range(self.state.capturenum - 1, -1, -1):
            if self.state.captureends[index] == UNFINISHEDCAPTURE:
                return index
        raise PatternSyntaxError("unmatched ')'")

    def classend(self, pp):
        '''Return the pattern index just past the single item at `pp`.

        Raises PatternSyntaxError when a '[' set is not closed.
        '''
        pc = self.pattern[pp]
        pp += 1
        if pc == ESCAPE:
            # The error case of a pattern ending with a bare ESCAPE is
            # handled in self.match() before this is ever called.
            return pp + 1
        elif pc == '[':
            try:
                if self.pattern[pp] == '^':
                    pp += 1
                if self.pattern[pp] == ']':  # a leading ']' is literal
                    pp += 1
                while self.pattern[pp] != ']':
                    if (self.pattern[pp] == ESCAPE and
                            pp + 1 < self.pattlen):
                        pp += 2  # skip the escaped character as well
                    else:
                        pp += 1
                return pp + 1
            except IndexError:
                raise PatternSyntaxError("missing ']'") from None
        return pp


####################
# Public functions #
####################


def set_escape_char(char):
    '''Change the module-wide escape character.

    `char` must be a str of length 1 and none of the special characters.
    Raises TypeError or ValueError otherwise.  This rebinds the global
    ESCAPE, so it affects every later call in the process.
    '''
    global ESCAPE
    if not isinstance(char, str):
        raise TypeError('"char" must be a unicode character')
    if len(char) != 1:
        raise ValueError('"char" must be a single character')
    invalidescapes = SPECIALS + ')]'
    if char in invalidescapes:
        raise ValueError('"char" cannot be any of "{}"'.format(invalidescapes))
    ESCAPE = char


def find(source, pattern, init=0, plain=False):
    '''Find the first match of `pattern` in `source` at or after `init`.

    Returns (start, end, *captures) with zero-based indexes and exclusive
    `end`, or None.  With plain=True the pattern is searched for as a
    literal substring.
    '''
    matcher = _PatternMatcher(source, pattern)
    return matcher.find_aux(type='find', init=init, plain=plain)


def match(source, pattern, init=0):
    '''Return the captures of the first match, or None.

    A pattern without captures yields the whole match.  A single value is
    returned directly; several values are returned as a tuple.
    '''
    matcher = _PatternMatcher(source, pattern)
    return matcher.find_aux(type='match', init=init, plain=False)


def gmatch(source, pattern):
    '''Yield the captures of every successive match in `source`.

    Each item has the same shape as the result of match().  A leading '^'
    is not treated as an anchor here.
    '''
    matcher = _PatternMatcher(source, pattern, noanchor=True)
    init = 0
    while True:
        result = matcher.find_aux(type='gmatch', init=init, plain=False)
        if result is None:
            return
        (matchstart, matchend), captures = result
        if matchend == matchstart:  # empty match at starting position?
            matchend += 1  # go forward at least one character
        init = matchend
        yield captures


def gsub(source, pattern, repl, limit=None, count=False):
    '''Return `source` with matches of `pattern` replaced using `repl`.

    `repl` may be a string (ESCAPE+'0'..'9' refer to the match and its
    captures), a dict keyed by the first capture, or a callable receiving
    the captures; a None or False result keeps the matched text.  At most
    `limit` replacements are made (default: no limit).  With count=True a
    (string, number_of_replacements) tuple is returned.
    '''
    matcher = _PatternMatcher(source, pattern)
    accum = []
    init = 0
    replcount = 0
    if limit is None:
        # Maximum possible substitutions is one more than len(source)
        # e.g. luapatt.gsub('test', '.-', '=') returns '=t=e=s=t='
        limit = len(source) + 1
    if matcher.anchor:
        # Not possible to match more than once if anchored at start, but
        # never raise a smaller limit requested by the caller.
        limit = min(limit, 1)
    while replcount < limit:
        result = matcher.find_aux(type='gsub', init=init, plain=False)
        if result is None:
            break
        replcount += 1
        (matchstart, matchend), captures = result
        accum.append(source[init:matchstart])
        accum.append(matcher.subst(captures, repl, matchstart, matchend))
        init = matchend
        if matchstart == matchend:  # empty match?
            if matchend < matcher.srclen:
                accum.append(source[matchend])  # skip a character
            # Always advance so the same empty match is never repeated.
            init += 1
    accum.append(source[init:])  # collect the rest of the source string
    finalstring = ''.join(accum)
    if count:
        return finalstring, replcount
    return finalstring
+ +import pytest + +import sys +sys.path.insert(0, r'src') +import luapatt + +def checkerror(exc, msg, f, *args): + with pytest.raises(exc) as e: + f(*args) + if msg: + assert msg in str(e.value) + +def syntaxerror(p, m): + checkerror(luapatt.PatternSyntaxError, m, luapatt.find, 'a', p) diff --git a/tests/test_custom.py b/tests/test_custom.py new file mode 100644 index 0000000..5441efe --- /dev/null +++ b/tests/test_custom.py @@ -0,0 +1,95 @@ +# coding: utf-8 + +# Copyright 2015 Jonathan Goble +# +# Permission is hereby granted, free of charge, to any person obtaining +# a copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. 
+ +import sys +sys.path.insert(0, r'src') + +import luapatt + +from helpers import checkerror + + +### SET_ESCAPE_CHAR + +class TestSetEscapeChar: + def setup_method(self, _): + luapatt.set_escape_char('%') + + def test_valid_escape(self): + luapatt.set_escape_char('@') + assert luapatt.ESCAPE == '@' + def test_invalid_escape(self): + checkerror(ValueError, 'cannot be any of', + luapatt.set_escape_char, '+') + assert luapatt.ESCAPE == '%' + def test_long_escape(self): + checkerror(ValueError, 'must be a single character', + luapatt.set_escape_char, '<>') + assert luapatt.ESCAPE == '%' + def test_null_escape(self): + checkerror(ValueError, 'must be a single character', + luapatt.set_escape_char, '') + assert luapatt.ESCAPE == '%' + def test_bytes_escape(self): + checkerror(TypeError, 'must be a unicode character', + luapatt.set_escape_char, b'@') + assert luapatt.ESCAPE == '%' + def test_non_string_escape(self): + checkerror(TypeError, 'must be a unicode character', + luapatt.set_escape_char, 42) + assert luapatt.ESCAPE == '%' + + def teardown_method(self, _): + luapatt.set_escape_char('%') + + +### ADDITIONAL TESTS FOR COMPLETE CODE COVERAGE + +def test_find_negative_init(): + assert luapatt.find('test', 't', -1) == (0, 1) + +def test_find_captures(): + assert luapatt.find(' test ', '([^ ]+)') == (2, 6, 'test') + +def test_error_gsub_repl_bare_percent(): + checkerror(luapatt.PatternSyntaxError, + "replacement string ends with bare '%'", + luapatt.gsub, 'test', 'te', 'fail%') + +def test_error_gsub_repl_escaped_escape(): + assert luapatt.gsub('4', '4', '%%') == '%' + +def test_too_many_captures(): + checkerror(luapatt.PatternTooManyCaptures, 'too many captures', + luapatt.find, 'test', '()' * (luapatt.MAXCAPTURES * 2)) + +def test_minexpand_fail(): + assert luapatt.find('test', 'te-x') is None + +def test_percent_g(): + checkerror(NotImplementedError, None, luapatt.find, 'test', '%g') + +def test_percent_c(): + checkerror(NotImplementedError, None, luapatt.find, 
'test', '%c') + +def test_percent_p(): + checkerror(NotImplementedError, None, luapatt.find, 'test', '%p') diff --git a/tests/test_lua1_basics.py b/tests/test_lua1_basics.py new file mode 100644 index 0000000..7f25416 --- /dev/null +++ b/tests/test_lua1_basics.py @@ -0,0 +1,164 @@ +# coding: utf-8 + +# Copyright 2015 Jonathan Goble +# +# Permission is hereby granted, free of charge, to any person obtaining +# a copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. 
+ +# Copied from the official Lua 5.3.2 test suite and converted to Python + +import sys +sys.path.insert(0, r'src') + +import luapatt + + +### BASIC FIND TESTS + +# empty patterns are tricky +def test_empty_empty(): + assert luapatt.find('', '') == (0, 0) +def test_plain_empty(): + assert luapatt.find('alo', '') == (0, 0) +# first position +def test_first_char(): + assert luapatt.find('a\0o a\0o a\0o', 'a', 0) == (0, 1) +# starts in the middle +def test_substr_expinit_1(): + assert luapatt.find('a\0o a\0o a\0o', 'a\0o', 1) == (4, 7) +# starts in the middle +def test_substr_expinit_2(): + assert luapatt.find('a\0o a\0o a\0o', 'a\0o', 8) == (8, 11) +# finds at the end +def test_substr_atend(): + assert luapatt.find('a\0a\0a\0a\0\0ab', '\0ab', 1) == (8, 11) +# last position +def test_last_char(): + assert luapatt.find('a\0a\0a\0a\0\0ab', 'b') == (10, 11) +# check ending +def test_nomatch_pastend(): + assert luapatt.find('a\0a\0a\0a\0\0ab', 'b\0') is None +def test_nomatch_pastend_nullsrc(): + assert luapatt.find('', '\0') is None +def test_substr(): + assert luapatt.find('alo123alo', '12') == (3, 5) + + +### QUANTIFIERS AND ANCHORS + +def test_nomatch_startanchor(): + assert luapatt.find('alo^123alo', '^12') is None + +def test_dot_asterisk_basic(): + assert luapatt.match("aaab", ".*b") == "aaab" +def test_dot_asterisk_backtrack1(): + assert luapatt.match("aaa", ".*a") == "aaa" +def test_dot_asterisk_matchzero(): + assert luapatt.match("b", ".*b") == "b" + +def test_dot_plus_basic(): + assert luapatt.match("aaab", ".+b") == "aaab" +def test_dot_plus_backtrack1(): + assert luapatt.match("aaa", ".+a") == "aaa" +def test_dot_plus_failzero(): + assert luapatt.match("b", ".+b") is None + +def test_dot_question_basic_1(): + assert luapatt.match("aaab", ".?b") == "ab" +def test_dot_question_basic_2(): + assert luapatt.match("aaa", ".?a") == "aa" +def test_dot_question_matchzero(): + assert luapatt.match("b", ".?b") == "b" + +def test_percent_l(): + assert 
luapatt.match('aloALO', '%l*') == 'alo' +def test_percent_a(): + assert luapatt.match('aLo_ALO', '%a*') == 'aLo' + +def test_plain_asterisk(): + assert luapatt.match('aaab', 'a*') == 'aaa' +def test_full_match_asterisk(): + assert luapatt.match('aaa', '^.*$') == 'aaa' +def test_asterisk_null_match(): + assert luapatt.match('aaa', 'b*') == '' +def test_asterisk_null_match_2(): + assert luapatt.match('aaa', 'ab*a') == 'aa' +def test_asterisk_match_one(): + assert luapatt.match('aba', 'ab*a') == 'aba' +def test_plain_plus(): + assert luapatt.match('aaab', 'a+') == 'aaa' +def test_full_match_plus(): + assert luapatt.match('aaa', '^.+$') == 'aaa' +def test_plain_plus_failzero(): + assert luapatt.match('aaa', 'b+') is None +def test_plain_plus_failzero_2(): + assert luapatt.match('aaa', 'ab+a') is None +def test_plus_match_one(): + assert luapatt.match('aba', 'ab+a') == 'aba' +def test_end_anchor(): + assert luapatt.match('a$a', '.$') == 'a' +def test_escaped_end_anchor(): + assert luapatt.match('a$a', '.%$') == 'a$' +def test_dollarsign_inmiddle(): + assert luapatt.match('a$a', '.$.') == 'a$a' +def test_double_dollarsign(): + assert luapatt.match('a$a', '$$') is None +def test_end_anchor_nomatch(): + assert luapatt.match('a$b', 'a$') is None +def test_end_anchor_matchnull(): + assert luapatt.match('a$a', '$') == '' +def test_asterisk_match_nullstring(): + assert luapatt.match('', 'b*') == '' +def test_plain_nomatch(): + assert luapatt.match('aaa', 'bb*') is None +def test_minus_match_zero(): + assert luapatt.match('aaab', 'a-') == '' +def test_full_match_minus(): + assert luapatt.match('aaa', '^.-$') == 'aaa' +def test_asterisk_maxexpand(): + assert luapatt.match('aabaaabaaabaaaba', 'b.*b') == 'baaabaaabaaab' +def test_minus_minexpand(): + assert luapatt.match('aabaaabaaabaaaba', 'b.-b') == 'baaab' +def test_dot_plain_endanchor(): + assert luapatt.match('alo xo', '.o$') == 'xo' +def test_class_x2_asterisk(): + assert luapatt.match(' \n isto é assim', '%S%S*') == 'isto' 
+def test_class_asterisk_endanchor():  # last run of non-space characters
+    assert luapatt.match(' \n isto é assim', '%S*$') == 'assim'
+def test_set_asterisk_endanchor():
+    assert luapatt.match(' \n isto é assim', '[a-z]*$') == 'assim'
+def test_negatedset_with_class():  # first char that is neither space nor a-z
+    assert luapatt.match('um caracter ? extra', '[^%sa-z]') == '?'
+def test_question_match_zero():
+    assert luapatt.match('', 'a?') == ''
+def test_question_match_one():
+    assert luapatt.match('á', 'á?') == 'á'
+def test_multi_question():
+    assert luapatt.match('ábl', 'á?b?l?') == 'ábl'
+def test_question_match_zero_2():  # every optional item matches empty at 0
+    assert luapatt.match(' ábl', 'á?b?l?') == ''
+def test_question_backtracking():  # optional 'a's must yield to the last 'a'
+    assert luapatt.match('aa', '^aa?a?a') == 'aa'
+
+
+### OTHERS
+
+def test_right_bracket_in_set():  # ']' right after '[^' is a set member
+    assert luapatt.match(']]]áb', '[^]]') == 'á'
+def test_percent_x():  # '0' and 'a' are hex digits, 'l' is not
+    assert luapatt.match("0alo alo", "%x*") == "0a"
diff --git a/tests/test_lua2_captgsub.py b/tests/test_lua2_captgsub.py
new file mode 100644
index 0000000..973b3d9
--- /dev/null
+++ b/tests/test_lua2_captgsub.py
@@ -0,0 +1,199 @@
+# coding: utf-8
+
+# Copyright 2015 Jonathan Goble
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+
+# Copied from the official Lua 5.3.2 test suite and converted to Python
+
+import sys
+sys.path.insert(0, r'src')
+
+import luapatt
+
+from helpers import checkerror
+
+
+### CAPTURES AND BACKREFERENCES
+
+def f1(s, p):
+    """Return the slice of s matched by p (backrefs shifted, '()' added)."""
+    p = luapatt.gsub(p, "%%([0-9])", lambda s: "%" + str(int(s)+1))
+    p = luapatt.gsub(p, "^(^?)", "%1()", 1)  # '()' after any leading '^'
+    p = luapatt.gsub(p, "($?)$", "()%1", 1)  # '()' before any trailing '$'
+    t = luapatt.match(s, p)
+    return s[t[0]:t[-1]]  # first/last captures are the injected positions
+def test_backreference():
+    assert f1('alo alx 123 b\0o b\0o', '(..*) %1') == "b\0o b\0o"
+def test_two_backreferences():
+    assert f1('axz123= 4= 4 34', '(.+)=(.*)=%2 %1') == '3= 4= 4 3'
+def test_asterisk_backreference():
+    assert f1('=======', '^(=*)=%1$') == '======='
+
+def test_backreference_nomatch():
+    assert luapatt.match('==========', '^([=]*)=%1$') is None
+
+def test_capture():  # with one capture, match returns the capture itself
+    assert luapatt.match("alo xyzK", "(%w+)K") == "xyz"
+def test_capture_null():
+    assert luapatt.match("254 K", "(%d*)K") == ""
+def test_capture_null_end_anchor():
+    assert luapatt.match("alo ", "(%w*)$") == ""
+def test_capture_no_match():
+    assert luapatt.match("alo ", "(%w+)$") is None
+def test_escaped_paren():  # '%(' is a literal parenthesis, not a capture
+    assert luapatt.find("(álo)", "%(á") == (0, 2)
+def test_nested_captures():  # several captures come back as a tuple
+    assert luapatt.match("âlo alo", "^(((.).).* (%w*))$") == \
+        ('âlo alo', 'âl', 'â', 'alo')
+def test_nested_captures_and_position_capture():
+    assert luapatt.match('0123456789', '(.+(.?)())') == \
+        ('0123456789', '', 10)
+
+
+### GSUB
+
+def test_gsub_basic():
+    assert luapatt.gsub('ülo ülo', 'ü', 'x') == 'xlo xlo'
+# trim
+def test_gsub_end_anchor():
+    assert luapatt.gsub('alo úlo ', ' +$', '') == 'alo úlo'
+# double trim
+def test_gsub_double_anchor():
+    assert luapatt.gsub(' alo alo ', '^%s*(.-)%s*$', '%1') == 'alo alo'
+def test_gsub_plus():
+    assert luapatt.gsub('alo alo \n 123\n ', '%s+', ' ') == 'alo alo 123 '
+def test_gsub_count():
+    t = "abç d"
+    result = luapatt.gsub(t, '(.)', '%1@', count=True)  # -> (text, n)
+    assert ('@' + result[0], result[1]) == (luapatt.gsub(t, '', '@'), 5)
+def test_gsub_init_and_limit():
+    assert luapatt.gsub('abçd', '(.)', '%0@', 2, count=True) == \
+        ('a@b@çd', 2)  # only two substitutions are made
+def test_gsub_position():
+    assert luapatt.gsub('alo alo', '()[al]', '%1') == '01o 45o'
+def test_gsub_captures():
+    assert luapatt.gsub("abc=xyz", "(%w*)(=)(%w+)", "%3%2%1-%0") == \
+        "xyz=abc-abc=xyz"
+def test_gsub_captures_2():  # without explicit captures, %1 acts like %0
+    assert luapatt.gsub("abc", "%w", "%1%0") == "aabbcc"
+def test_gsub_captures_3():
+    assert luapatt.gsub("abc", "%w+", "%0%1") == "abcabc"
+def test_gsub_append():
+    assert luapatt.gsub('áéí', '$', '\0óú') == 'áéí\0óú'
+def test_gsub_start_anchor_nullstring():
+    assert luapatt.gsub('', '^', 'r') == 'r'
+def test_gsub_end_anchor_nullstring():
+    assert luapatt.gsub('', '$', 'r') == 'r'
+
+def test_gsub_function():
+    assert luapatt.gsub("um (dois) tres (quatro)", "(%(%w+%))", str.upper) \
+        == "um (DOIS) tres (QUATRO)"
+
+def test_gsub_function_sideeffect():
+    dic = {}
+    def setkey(k, v):
+        dic[k] = v
+    luapatt.gsub("a=roberto,roberto=a", "(%w+)=(%w%w*)", setkey)
+    assert dic == {'a': "roberto", 'roberto': "a"}
+
+def test_gsub_custom_function():
+    def f(a, b):
+        return luapatt.gsub(a, '.', b)
+    assert luapatt.gsub(
+        "trocar tudo em |teste|b| é |beleza|al|", "|([^|]*)|([^|]*)|", f
+    ) == "trocar tudo em bbbbb é alalalalalal"
+
+def test_gsub_func_accumulator():
+    t = {}
+    s = 'a alo jose  joao'  # two spaces: 'joao' must start at offset 12
+    def f(a, w, b):
+        assert len(w) == b - a
+        t[a] = b - a
+    r = luapatt.gsub(s, '()(%w+)()', f)
+    assert (s, t) == (r, {0: 1, 2: 3, 6: 4, 12: 4})
+
+
+### BALANCING
+
+def isbalanced(s):  # True when removing '%b()' spans leaves no parenthesis
+    return luapatt.find(luapatt.gsub(s, "%b()", ""), "[()]") is None
+
+def test_balancing_1():
+    assert isbalanced("(9 ((8))(\0) 7) \0\0 a b ()(c)() a")
+def test_balancing_2():
+    assert not isbalanced("(9 ((8) 7) a b (\0 c) a")
+def test_balancing_dupe_args():
+    assert luapatt.gsub("alo 'oi' alo", "%b''", '"') == 'alo " alo'
+
+
+### MORE GSUB
+
+def test_gsub_function_readfromlist():
+    t = ["apple", "orange", "lime"]
+    assert luapatt.gsub("x and x and x", "x", lambda _: t.pop(0)) == \
+        "apple and orange and lime"
+
+def test_gsub_asterisk_function():
+    t = []
+    luapatt.gsub("first second word", "%w%w*", lambda w: t.append(w))
+    assert t == ['first', 'second', 'word']
+
+def test_gsub_limit_function():
+    t = []
+    assert luapatt.gsub("first second word", "%w+", lambda w: t.append(w),
+                        2) == "first second word"
+    assert t == ['first', 'second']
+
+def test_gsub_error_bad_capture_index():
+    checkerror(luapatt.PatternSyntaxError, "invalid capture index %2",
+               luapatt.gsub, "alo", ".", "%2")
+def test_gsub_error_percent0_in_pattern():
+    checkerror(luapatt.PatternSyntaxError, "invalid capture index %0",
+               luapatt.gsub, "alo", "(%0)", "a")
+def test_gsub_error_backreference_incomplete():
+    checkerror(luapatt.PatternSyntaxError, "invalid capture index %1",
+               luapatt.gsub, "alo", "(%1)", "a")
+def test_gsub_bad_percent_in_replacement():
+    checkerror(luapatt.PatternSyntaxError,
+               "invalid '%x' in replacement string",
+               luapatt.gsub, "alo", ".", "%x")
+
+# recursive nest of gsubs
+def test_gsub_recursion():
+    def rev(s):
+        return luapatt.gsub(s, "(.)(.+)", lambda c, s1: rev(s1) + c)
+    x = "abcdef"
+    assert rev(rev(x)) == x
+
+# gsub with dicts
+def test_gsub_empty_dict():
+    assert luapatt.gsub("alo alo", ".", {}) == "alo alo"
+def test_gsub_dict():
+    assert luapatt.gsub("alo alo", "(.)", {'a': "AA", 'l': ""}) == "AAo AAo"
+def test_gsub_dict_partial_capture():
+    assert luapatt.gsub("alo alo", "(.).", {'a': "AA", 'l': "K"}) == \
+        "AAo AAo"
+def test_gsub_dict_multi_captures_False():
+    assert luapatt.gsub("alo alo", "((.)(.?))", {'al': "AA", 'o': False}) \
+        == "AAo AAo"  # the False value keeps the original 'o'
+
+def test_gsub_dict_position_captures():
+    assert luapatt.gsub("alo alo", "().", {0: 'x', 1: 'yy', 2: 'zzz'}) == \
+        "xyyzzz alo"
diff --git a/tests/test_lua3_gmfronterror.py b/tests/test_lua3_gmfronterror.py
new file mode 100644
index 0000000..8113edd
--- /dev/null
+++ b/tests/test_lua3_gmfronterror.py
@@ -0,0 +1,167 @@
+# coding: utf-8
+
+# Copyright 2015 Jonathan Goble
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+
+# Copied from the official Lua 5.3.2 test suite and converted to Python
+
+import sys
+sys.path.insert(0, r'src')
+
+import luapatt
+
+from helpers import checkerror, syntaxerror
+
+
+### GMATCH
+
+def test_gmatch_empty_matches():
+    a = -1
+    for i in luapatt.gmatch('abcde', '()'):
+        assert i == a + 1  # positions advance one at a time
+        a = i
+    assert a == 5  # the last empty match sits at the end of the subject
+
+def test_gmatch_basic():
+    """gmatch yields each whole match when the pattern has no captures."""
+    # Materialise the iterator directly instead of a manual append loop.
+    words = list(luapatt.gmatch("first second word", "%w+"))
+    assert words == ["first", "second", "word"]
+
+def test_gmatch_backreferences():
+    t = [2, 5, 8]  # expected offsets of the doubled characters
+    for i in luapatt.gmatch("xuxx uu ppar r", "()(.)%2"):
+        assert i[0] == t.pop(0)
+    assert len(t) == 0
+
+def test_gmatch_multi_captures():
+    t = {}
+    for i, j in luapatt.gmatch("13 14 10 = 11, 15= 16, 22=23",
+                               "(%d+)%s*=%s*(%d+)"):
+        t[int(i)] = int(j)
+    for k, v in t.items():
+        assert k + 1 == v + 0
+    assert len(t) == 3
+
+
+### FRONTIER PATTERNS
+
+def test_frontier_class():  # only an 'a' that starts a word is replaced
+    assert luapatt.gsub("aaa aa a aaa a", "%f[%w]a", "x") == "xaa xa x xaa x"
+def test_frontier_left_bracket():
+    assert luapatt.gsub("[[]] [][] [[[[", "%f[[].", "x") == "x[]] x]x] x[[["
+def test_frontier_class_nullmatch():  # the frontier itself consumes nothing
+    assert luapatt.gsub("01abc45de3", "%f[%d]", ".") == ".01abc.45de.3"
+def test_frontier_class_match():
+    assert luapatt.gsub("01abc45 de3x", "%f[%D]%w", ".") == "01.bc45 de3."
+def test_frontier_negated_nullchar():
+    assert luapatt.gsub("function", "%f[^\0]%w", ".") == ".unction"
+def test_frontier_nullchar():
+    assert luapatt.gsub("function", "%f[\0]", ".") == "function."
+
+def test_frontier_nullmatch_atstart():
+    assert luapatt.find("a", "%f[a]") == (0, 0)
+def test_frontier_nullmatch_negated_percent_z():
+    assert luapatt.find("a", "%f[^%z]") == (0, 0)
+def test_frontier_nullmatch_atend():
+    assert luapatt.find("a", "%f[^%l]") == (1, 1)
+def test_frontier_nullmatch_inmiddle():
+    assert luapatt.find("aba", "%f[a%z]") == (2, 2)
+def test_frontier_nullmatch_percent_z():
+    assert luapatt.find("aba", "%f[%z]") == (3, 3)
+def test_frontier_nomatch():
+    assert luapatt.find("aba", "%f[%l%z]") is None
+def test_frontier_nomatch_2():
+    assert luapatt.find("aba", "%f[^%l%z]") is None
+
+def test_multi_frontier():
+    assert luapatt.find(" alo aalo allo", "%f[%S].-%f[%s].-%f[%S]") == \
+        (1, 5)
+def test_multi_frontier_2():
+    assert luapatt.match(" alo aalo allo", "%f[%S](.-%f[%s].-%f[%S])") == \
+        'alo '
+
+def test_frontier_gmatch():
+    """A position capture before a frontier yields each word start."""
+    # 0-based offsets of the five words in the subject below.
+    expected = [0, 4, 8, 13, 16]
+    found = list(luapatt.gmatch("alo alo th02 is 1hat", "()%f[%w%d]"))
+    assert expected == found
+
+
+### MALFORMED PATTERN ERRORS
+
+def test_error_unfinished_capture():
+    syntaxerror("(.", "unfinished capture")
+def test_error_invalid_right_paren():
+    syntaxerror(".)", "unmatched ')'")
+def test_error_unfinished_set():
+    syntaxerror("[a", "missing ']'")
+def test_error_empty_set():
+    syntaxerror("[]", "missing ']'")
+def test_error_empty_negated_set():
+    syntaxerror("[^]", "missing ']'")
+def test_error_set_bare_percent():
+    syntaxerror("[a%]", "missing ']'")
+def test_error_end_with_bare_percent():
+    syntaxerror("[a%", "missing ']'")
+def test_error_balance_no_args():
+    # NOTE(review): expected text ends with ')' unlike its siblings - confirm
+    syntaxerror("%b", "missing arguments to '%b')")
+def test_error_balance_one_arg():  # NOTE(review): same trailing ')' as above
+    syntaxerror("%ba", "missing arguments to '%b')")
+def test_error_lone_percent():
+    syntaxerror('%', "pattern ends with bare '%'")
+def test_error_frontier_no_arg():
+    syntaxerror("%f", "missing '[' after '%f'")
+
+### STACK OVERFLOW
+
+def f2(size):  # match 'a' * size against '.?' * size
+    s = "a" * size
+    p = ".?" * size
+    return luapatt.match(s, p)
+
+def test_stack_no_overflow():
+    assert len(f2(80)) == 80
+
+def test_stack_overflow():
+    checkerror(luapatt.PatternStackOverflow,
+               'pattern too complex, exceeded recursion limit',
+               f2, 200000)
+
+
+### BIG STRINGS (these take a few seconds)
+
+class TestBigStrings:
+    @classmethod
+    def setup_class(cls):
+        cls.a = 'a' * 300000  # one shared subject for all three tests
+    def test_big_string_backtrack_one(self):
+        assert luapatt.find(self.a, '^a*.?$')
+    def test_big_string_no_match(self):
+        assert not luapatt.find(self.a, '^a*.?b$')
+    def test_big_string_minus(self):
+        assert luapatt.find(self.a, '^a-.?$')
+    @classmethod
+    def teardown_class(cls):
+        del cls.a  # drop the large string once the class is done
+
+
diff --git a/tests/test_lua4_sets.py b/tests/test_lua4_sets.py
new file mode 100644
index 0000000..d6f683c
--- /dev/null
+++ b/tests/test_lua4_sets.py
@@ -0,0 +1,78 @@
+# coding: utf-8
+
+# Copyright 2015 Jonathan Goble
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ +# Copied from the official Lua 5.3.2 test suite and converted to Python + +import sys +sys.path.insert(0, r'src') + +import luapatt + + +### SETS + +### This class takes several minutes to run, so be patient. test_percent_z and +### test_dot are responsible for two-thirds of that time. + +class TestSets: + @classmethod + def setup_class(cls): + cls.abc = ''.join(map(chr, range(sys.maxunicode + 1))) + assert len(cls.abc) == sys.maxunicode + 1 + + def strset(self, p): + result = set() + def record(char): + result.add(char) + luapatt.gsub(self.abc, p, record) + return result + + def test_hex_range_set(self): + assert len(self.strset('[\xc0-\xd0]')) == 17 + def test_range_set(self): + assert self.strset('[a-z]') == set("abcdefghijklmnopqrstuvwxyz") + def test_range_and_class_set(self): + assert self.strset('[a-z%d]') == self.strset('[%da-uu-z]') + def test_dash_at_end_of_set(self): + assert self.strset('[a-]') == set("-a") + def test_negated_set(self): + assert self.strset('[^%W]') == self.strset('[%w]') + def test_right_bracket_percent_set(self): + assert self.strset('[]%%]') == set('%]') + def test_escaped_dash_in_set(self): + assert self.strset('[a%-z]') == set('-az') + def test_escapes_in_set(self): + assert self.strset('[%^%[%-a%]%-b]') == set('-[]^ab') + def test_percent_z(self): + assert self.strset('%Z') == \ + self.strset('[\u0001-{}]'.format(chr(sys.maxunicode))) + def test_dot(self): + assert self.strset('.') == \ + self.strset('[\u0001-{}%z]'.format(chr(sys.maxunicode))) + + # Custom tests: + def test_percent_u(self): + assert self.strset('%u') == set(c for c in self.abc if c.isupper()) + + @classmethod + def teardown_class(cls): + del cls.abc