diff --git a/weld-python/tests/grizzly/test_series.py b/weld-python/tests/grizzly/core/test_series.py similarity index 98% rename from weld-python/tests/grizzly/test_series.py rename to weld-python/tests/grizzly/core/test_series.py index 3cece7fba..351dfc8ee 100644 --- a/weld-python/tests/grizzly/test_series.py +++ b/weld-python/tests/grizzly/core/test_series.py @@ -6,7 +6,7 @@ import numpy as np import pandas as pd import pytest -import weld.grizzly.series as gr +import weld.grizzly as gr types_ = ['int8', 'uint8', 'int16', 'uint16', 'int32',\ 'uint32', 'int64', 'uint64', 'float32', 'float64'] @@ -134,7 +134,7 @@ def test_indexing(): def test_unsupported_binop_error(): # Test unsupported - from weld.grizzly.error import GrizzlyError + from weld.grizzly.core.error import GrizzlyError with pytest.raises(GrizzlyError): a = gr.GrizzlySeries([1,2,3]) b = pd.Series([1,2,3]) diff --git a/weld-python/tests/grizzly/core/test_strings.py b/weld-python/tests/grizzly/core/test_strings.py new file mode 100644 index 000000000..e760c1cd0 --- /dev/null +++ b/weld-python/tests/grizzly/core/test_strings.py @@ -0,0 +1,117 @@ +""" +Test string functionality. + +The behavior is tested against Pandas unless noted otherwise. + +""" + +import numpy as np +import pandas as pd +import pytest +import weld.grizzly as gr + +# To check whether the output is a string. +# TODO(shoumik): There should be a better way to do this, another reason +# to use ExtensionArray and a custom dtype for Weldified string arrays. +from weld.types import WeldVec, I8 + +def compare_vs_pandas(func, strings, *args, **kwargs): + pandas_series = pd.Series(strings) + grizzly_series = gr.GrizzlySeries(strings) + + pandas_result = getattr(pandas_series.str, func)(*args, **kwargs) + grizzly_result = getattr(grizzly_series.str, func)(*args, **kwargs) + if grizzly_result.output_type.elem_type != WeldVec(I8()): + grizzly_result = grizzly_result.to_pandas() + else: + # Perform UTF-8 decoding. 
+ grizzly_result = grizzly_result.str.to_pandas() + assert pandas_result.equals(grizzly_result) + +# Strings to test capitalization functions. +capitals_strings = [ + "hello", "HELLO", "LonGHelLO", "", + "3.141592, it's pi!", "many words in this one"] + +def test_lower(): + compare_vs_pandas('lower', capitals_strings) + +def test_upper(): + compare_vs_pandas('upper', capitals_strings) + +def test_capitalize(): + compare_vs_pandas('capitalize', capitals_strings) + +def test_get(): + """ + Behavior of get is different in Grizzly -- it currently returns empty strings + in cases where Pandas returns NaN. This will be changed in a later patch. + + """ + inp = ["hello", "world", "test", "me", ''] + expect = ['l', 'l', 't', '', ''] + grizzly_result = gr.GrizzlySeries(inp).str.get(3).str.to_pandas() + pandas_result = pd.Series(expect) + assert pandas_result.equals(grizzly_result) + + expect = ['o', 'd', 't', 'e', ''] + grizzly_result = gr.GrizzlySeries(inp).str.get(-1).str.to_pandas() + pandas_result = pd.Series(expect) + assert pandas_result.equals(grizzly_result) + + expect = ['', '', '', '', ''] + grizzly_result = gr.GrizzlySeries(inp).str.get(-50).str.to_pandas() + pandas_result = pd.Series(expect) + assert pandas_result.equals(grizzly_result) + +def test_strip(): + compare_vs_pandas('strip', ["", + " hi ", + "\t\thi\n", + """ + + hello + + """, + " \t goodbye", + "goodbye again ", + " \n hi \n bye \n ", + """ + + hi + + bye + + """]) + +def test_contains(): + compare_vs_pandas('contains', ["abc", "abcdefg", "gfedcbaabcabcdef", ""], "abc") + +def test_startswith(): + compare_vs_pandas('startswith', ["abc", "abcdefg", "gfedcba", "", "defabc"], "abc") + +def test_endswith(): + compare_vs_pandas('endswith', ["abc", "abcdefg", "gfedabc", "", "defabc"], "abc") + +def test_find(): + compare_vs_pandas('find', ["abc", "abcdefg", "gfedcbaabcabcdef", ""], "abc") + compare_vs_pandas('find', ["abc", "abcdefg", "gfedcbaabcabcdef", ""], "abc", 2) + compare_vs_pandas('find', ["abc", 
"abcdefg", "gfedcbaabcabcdef", ""], "abc", 3) + compare_vs_pandas('find', ["abc", "abcdefg", "gfedcbaabcabcdef", ""], "abc", end=2) + compare_vs_pandas('find', ["abc", "abcdefg", "gfedcbaabcabcdef", ""], "abc", end=3) + compare_vs_pandas('find', ["abc", "abcdefg", "gfedcbaabcabcdef", ""], "abc", 3, end=7) + compare_vs_pandas('find', ["abc", "abcdefg", "gfedcbaabcabcdef", ""], "abc", 100, end=105) + +def test_replace(): + """ + Behavior of replace is different in Grizzly -- it currently only replaces the *first* + occurrance. This will be changed in a later patch. + + """ + import copy + inp = ["abc", "abcdefg", "abcabcabc", "gfedcbaabcabcdef", "", "XYZ"] + expect = [s.replace("abc", "XYZ", 1) for s in copy.copy(inp)] + grizzly_result = gr.GrizzlySeries(inp).str.replace("abc", "XYZ").str.to_pandas() + pandas_result = pd.Series(expect) + assert pandas_result.equals(grizzly_result) + diff --git a/weld-python/weld/compile.py b/weld-python/weld/compile.py index b5bb436ba..160048c79 100644 --- a/weld-python/weld/compile.py +++ b/weld-python/weld/compile.py @@ -81,7 +81,7 @@ def compile(program, arg_types, encoders, restype, decoder, conf=None): >>> from weld.types import * >>> func = compile("|x: i32| x + 1", ... [I32()], [None], - ... I32(), None) + ... I32(), None) ... >>> func(100)[0] 101 @@ -92,7 +92,7 @@ def compile(program, arg_types, encoders, restype, decoder, conf=None): >>> func = compile("|x: i32, y: i32| x + y", ... [I32(), I32()], [None, None], - ... I32(), None) + ... I32(), None) ... >>> func(5, 6)[0] 11 @@ -101,7 +101,7 @@ def compile(program, arg_types, encoders, restype, decoder, conf=None): >>> func = compile("|x: i32| x + 1", ... [I32()], [PrimitiveWeldEncoder()], - ... I32(), PrimitiveWeldDecoder()) + ... I32(), PrimitiveWeldDecoder()) ... 
>>> func(100)[0] 101 @@ -158,6 +158,7 @@ def func(*args, context=None): raw_args_pointer = ctypes.addressof(raw_args) value = WeldValue(raw_args_pointer) + if context is None: context = WeldContext(conf) diff --git a/weld-python/weld/conftest.py b/weld-python/weld/conftest.py new file mode 100644 index 000000000..acbd85af3 --- /dev/null +++ b/weld-python/weld/conftest.py @@ -0,0 +1,15 @@ + +import pytest + +import numpy as np +import pandas as pd +import weld.grizzly as gr + +@pytest.fixture(autouse=True) +def add_imports(doctest_namespace): + """ + Make `gr`, `np`, and `pd` available for doctests. + """ + doctest_namespace["np"] = np + doctest_namespace["pd"] = pd + doctest_namespace["gr"] = gr diff --git a/weld-python/weld/encoders/numpy.py b/weld-python/weld/encoders/numpy.py index 2f79593b6..23852f946 100644 --- a/weld-python/weld/encoders/numpy.py +++ b/weld-python/weld/encoders/numpy.py @@ -268,7 +268,7 @@ class StringConversionFuncs(object): """ stringfuncs = ctypes.PyDLL(weld.encoders._strings.__file__) - string_cclass = WeldVec(I8()).ctype_class + string_cclass = WeldVec(WeldVec(I8())).ctype_class @staticmethod def numpy_string_array_to_weld(arr): diff --git a/weld-python/weld/grizzly/__init__.py b/weld-python/weld/grizzly/__init__.py index e69de29bb..963902a18 100644 --- a/weld-python/weld/grizzly/__init__.py +++ b/weld-python/weld/grizzly/__init__.py @@ -0,0 +1,2 @@ + +from weld.grizzly.core.series import GrizzlySeries diff --git a/weld-python/weld/grizzly/core/__init__.py b/weld-python/weld/grizzly/core/__init__.py new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/weld-python/weld/grizzly/core/__init__.py @@ -0,0 +1 @@ + diff --git a/weld-python/weld/grizzly/error.py b/weld-python/weld/grizzly/core/error.py similarity index 100% rename from weld-python/weld/grizzly/error.py rename to weld-python/weld/grizzly/core/error.py diff --git a/weld-python/weld/grizzly/series.py b/weld-python/weld/grizzly/core/series.py similarity index 92% 
rename from weld-python/weld/grizzly/series.py rename to weld-python/weld/grizzly/core/series.py index fd7245358..0b5b2b124 100644 --- a/weld-python/weld/grizzly/series.py +++ b/weld-python/weld/grizzly/core/series.py @@ -8,19 +8,18 @@ import warnings import weld.encoders.numpy as wenp +import weld.grizzly.weld.str as weldstr -from pandas.core.internals import SingleBlockManager from weld.lazy import PhysicalValue, WeldLazy, WeldNode, identity +from weld.grizzly.weld.ops import * +from weld.grizzly.core.error import GrizzlyError +from weld.grizzly.core.strings import StringMethods from weld.types import * -from .weld.ops import * -from .error import * - def _grizzlyseries_constructor_with_fallback(data=None, **kwargs): """ A flexible constructor for Series._constructor, which needs to be able - to fall back to a Series (if a certain operation does not produce - geometries) + to fall back to a Series (if a certain operation cannot produce GrizzlySeries). """ try: return GrizzlySeries(data=data, **kwargs) @@ -114,7 +113,7 @@ def values(self): >>> x.values Traceback (most recent call last): ... - weld.grizzly.error.GrizzlyError: GrizzlySeries is not evaluated and does not have values. Try calling 'evaluate()' first. + weld.grizzly.core.error.GrizzlyError: GrizzlySeries is not evaluated and does not have values. Try calling 'evaluate()' first. """ if not self.is_value: raise GrizzlyError("GrizzlySeries is not evaluated and does not have values. 
Try calling 'evaluate()' first.") @@ -212,7 +211,10 @@ def _supports_grizzly(cls, data): """ if not isinstance(data, np.ndarray) or data.ndim != 1: return None - elem_type = wenp.dtype_to_weld_type(data.dtype) + if data.dtype.char == 'S': + elem_type = WeldVec(I8()) + else: + elem_type = wenp.dtype_to_weld_type(data.dtype) return WeldVec(elem_type) if elem_type is not None else None # ---------------------- Initialization ------------------------------ @@ -232,13 +234,13 @@ def __new__(cls, data, dtype=None, index=None, **kwargs): 2 3 dtype: int64 >>> x.__class__ - + >>> x = GrizzlySeries(np.ones(5)) >>> x.__class__ - - >>> y = GrizzlySeries(['hi', 'bye']) # Unsupported + + >>> y = GrizzlySeries(['hi', 'bye']) >>> y.__class__ - + >>> y = GrizzlySeries([1, 2, 3], index=[1, 0, 2]) # Unsupported >>> y.__class__ @@ -246,20 +248,30 @@ def __new__(cls, data, dtype=None, index=None, **kwargs): s = None if isinstance(data, WeldLazy): self = super(GrizzlySeries, cls).__new__(cls) - super(GrizzlySeries, self).__init__(None, dtype=dtype, **kwargs) + super(GrizzlySeries, self).__init__(np.array([], dtype=dtype), **kwargs) self.weld_value_ = data return self - elif index is not None and not isinstance(index, pd.RangeIndex): + + if index is not None and not isinstance(index, pd.RangeIndex): # TODO(shoumik): This is probably incomplete, since we could have a # RangeIndex that does not capture the full span of the data, has a # non-zero step, etc. return pd.Series(data, dtype=dtype, index=index, **kwargs) - elif len(kwargs) != 0: + + if len(kwargs) != 0: + # Unsupported arguments present: bail for now. return pd.Series(data, dtype=dtype, index=index, **kwargs) + + if isinstance(data, list) and len(data) > 0 and isinstance(data[0], str): + # Try to convert a list of strings into a supported Numpy array. 
+ data = np.array(data, dtype='S') + + if isinstance(data, pd.Series): + data = data.values elif not isinstance(data, np.ndarray): # First, convert the input into a Series backed by an ndarray. - s = pd.Series(data, dtype=dtype, index=index, **kwargs) - data = s.values + s = pd.Series(data, dtype=dtype, index=index, **kwargs) + data = s.values # Try to create a Weld type for the input. weld_type = GrizzlySeries._supports_grizzly(data) @@ -270,9 +282,17 @@ def __new__(cls, data, dtype=None, index=None, **kwargs): PhysicalValue(data, weld_type, GrizzlySeries._encoder), GrizzlySeries._decoder) return self + # Don't re-convert values if we did it once already -- it's expensive. return s if s is not None else pd.Series(data, dtype=dtype, index=index, **kwargs) + # ---------------------- StringMethods ------------------------------ + + @property + def str(self): + # TODO(shoumik.palkar): Use pandas.core.accessor.CachedAccessor? + return StringMethods(self) + # ---------------------- Indexing ------------------------------ def __setitem__(self, key, value): diff --git a/weld-python/weld/grizzly/core/strings.py b/weld-python/weld/grizzly/core/strings.py new file mode 100644 index 000000000..e603a8cc6 --- /dev/null +++ b/weld-python/weld/grizzly/core/strings.py @@ -0,0 +1,252 @@ +""" +String methods supported by Series. + +""" + +import weld.encoders.numpy as wenp +import weld.grizzly.weld.str as weldstr + +from weld.types import * + +class StringMethods(object): + """ + String methods for Grizzly. Currently, string methods only apply to ASCII + strings; while users can pass UTF-8 strings into Grizzly, their codepoints + will be ignored by the below operations and will be returned unmodified. 
+ + """ + + __slots__ = [ "series", "constructor" ] + + def __init__(self, series): + if series.dtype.char != 'S': + raise ValueError("StringMethods only available for Series with dtype 'S'") + self.series = series + # TODO(shoumik): This is a hack: we should define an abstract class that captures + # the interface additional functionality needs. + self.constructor = self.series.__class__ + + def to_pandas(self): + """ + Convert an array of strings to a Pandas series. + + We provide a specialized implementation of `to_pandas` here that will perform UTF-8 decoding + of the raw bytestrings that Grizzly series operate over. + + Examples + -------- + >>> x = gr.GrizzlySeries(["Welcome", "to", "Grizzly!"]) + >>> x + 0 b'Welcome' + 1 b'to' + 2 b'Grizzly!' + dtype: bytes64 + >>> x.str.to_pandas() + 0 Welcome + 1 to + 2 Grizzly! + dtype: object + + """ + return self.series.evaluate().to_pandas().str.decode("utf-8") + + def _apply(self, func, *args, return_weld_elem_type=None): + """ + Apply the given weldfunc to `self.series` and return a new GrizzlySeries. + + If the return type of the result is not a string GrizzlySeries, pass + 'return_weld_elem_type' to specify the element type of the result. + + """ + output_type = self.series.output_type if return_weld_elem_type is None else WeldVec(return_weld_elem_type) + dtype = 'S' if return_weld_elem_type is None else wenp.weld_type_to_dtype(return_weld_elem_type) + lazy = func(self.series.weld_value_, *args)(output_type, self.constructor._decoder) + return (self.constructor)(lazy, dtype=dtype) + + def lower(self): + """ + Lowercase strings. + + Examples + -------- + >>> x = gr.GrizzlySeries(["HELLO", "WorLD"]) + >>> x.str.lower().str.to_pandas() + 0 hello + 1 world + dtype: object + + """ + return self._apply(weldstr.lower) + + def upper(self): + """ + Uppercase strings. 
+ + Examples + -------- + >>> x = gr.GrizzlySeries(["hello", "WorlD"]) + >>> x.str.upper().str.to_pandas() + 0 HELLO + 1 WORLD + dtype: object + + """ + return self._apply(weldstr.upper) + + def capitalize(self): + """ + Capitalize the first character in each string. + + Examples + -------- + >>> x = gr.GrizzlySeries(["hello", "worlD"]) + >>> x.str.capitalize().str.to_pandas() + 0 Hello + 1 World + dtype: object + + """ + return self._apply(weldstr.capitalize) + + def get(self, index): + """ + Get the character at index 'i' from each string. If 'index' is greater than + the string length, this returns an empty string. If 'index' is less than 0, + this wraps around, using Python's indexing behavior. + + Examples + -------- + >>> x = gr.GrizzlySeries(["hello", "worlD"]) + >>> x.str.get(4).str.to_pandas() + 0 o + 1 D + dtype: object + >>> x.str.get(-3).str.to_pandas() + 0 l + 1 r + dtype: object + + """ + return self._apply(weldstr.get, index) + + def strip(self): + """ + Strip whitespace from the string. + + Examples + -------- + >>> x = gr.GrizzlySeries([" hello ", " world \t "]) + >>> x.str.strip().str.to_pandas() + 0 hello + 1 world + dtype: object + + """ + return self._apply(weldstr.strip) + + def contains(self, pat): + """ + Returns whether each string contains the provided pattern. + + Pattern must be a Python string. + + Examples + -------- + >>> x = gr.GrizzlySeries(["hello", "world"]) + >>> x.str.contains('wor').evaluate() + 0 False + 1 True + dtype: bool + + """ + if not isinstance(pat, str): + raise TypeError("pattern in contains must be a Python 'str'") + return self._apply(weldstr.contains, pat, return_weld_elem_type=Bool()) + + def startswith(self, pat): + """ + Returns whether each string starts with the provided pattern. + + Pattern must be a Python string. 
+ + Examples + -------- + >>> x = gr.GrizzlySeries(["hello", "world"]) + >>> x.str.startswith('wo').evaluate() + 0 False + 1 True + dtype: bool + + """ + if not isinstance(pat, str): + raise TypeError("pattern in startswith must be a Python 'str'") + return self._apply(weldstr.startswith, pat, return_weld_elem_type=Bool()) + + def endswith(self, pat): + """ + Returns whether each string ends with the provided pattern. + + Pattern must be a Python string. + + Examples + -------- + >>> x = gr.GrizzlySeries(["hello", "world"]) + >>> x.str.endswith('rld').evaluate() + 0 False + 1 True + dtype: bool + + """ + if not isinstance(pat, str): + raise TypeError("pattern in endswith must be a Python 'str'") + return self._apply(weldstr.endswith, pat, return_weld_elem_type=Bool()) + + def find(self, sub, start=0, end=None): + """ + Find 'sub' in each string. Each string is searched in the range [start,end). + + 'sub' must be a Python string, and 'start' and 'end' must be Python integers. + + Examples + -------- + >>> x = gr.GrizzlySeries(["bigfatcat", "fatcatbig", "reallybigcat"]) + >>> x.str.find('fat').evaluate() + 0 3 + 1 0 + 2 -1 + dtype: int64 + >>> x.str.find('big', end=2).evaluate() + 0 -1 + 1 -1 + 2 -1 + dtype: int64 + + """ + if not isinstance(sub, str): + raise TypeError("sub in find must be a Python 'str'") + if not isinstance(start, int): + raise TypeError("start in find must be a Python 'int'") + if end is not None and not isinstance(end, int): + raise TypeError("end in find must be a Python 'int'") + return self._apply(weldstr.find, sub, start, end, return_weld_elem_type=I64()) + + def replace(self, pat, rep): + """ + Replaces the first occurrence of 'pat' with 'rep' in each string. + + Pattern and replacement must be Python strings. 
+ + Examples + -------- + >>> x = gr.GrizzlySeries(["hello", "world"]) + >>> x.str.replace('o', 'lalala').str.to_pandas() + 0 helllalala + 1 wlalalarld + dtype: object + + """ + if not isinstance(pat, str): + raise TypeError("pattern in replace must be a Python 'str'") + if not isinstance(rep, str): + raise TypeError("replacement in replace must be a Python 'str'") + return self._apply(weldstr.replace, pat, rep) diff --git a/weld-python/weld/grizzly/weld/str.py b/weld-python/weld/grizzly/weld/str.py new file mode 100644 index 000000000..7f7622a3d --- /dev/null +++ b/weld-python/weld/grizzly/weld/str.py @@ -0,0 +1,374 @@ +""" +String functions exported as weldfuncs. + +Each function takes an argument representing an array of strings, and outputs a +program that applies some transformation on each string. The functions are annotated +with `weld.lazy.weldfunc`, so they accept `WeldLazy` objects and return functions +for constructing Weld programs. + +These are adapted from +https://github.com/weld-project/baloo/blob/master/baloo/weld/weld_str.py. + +We may choose to re-implement these as UDF calls to Rust's UTF-8 library in the future. + +""" + +import weld.lazy + +def string_to_weld_literal(s): + """ + Converts a string to a UTF-8 encoded Weld literal byte-vector. + + Examples + -------- + >>> string_to_weld_literal('hello') + '[104c,101c,108c,108c,111c]' + + """ + return "[" + ",".join([str(b) + 'c' for b in list(s.encode('utf-8'))]) + "]" + +@weld.lazy.weldfunc +def lower(stringarr): + """ + Convert values to lowercase. + + """ + return """map( + {stringarr}, + |e: vec[i8]| + result( + for(e, + appender[i8], + |c: appender[i8], j: i64, f: i8| + if(f > 64c && f < 91c, + merge(c, f + 32c), + merge(c, f)) + ) + ) + )""".format(stringarr=stringarr) + + +@weld.lazy.weldfunc +def upper(stringarr): + """ + Convert values to uppercase. 
+ + """ + return """map( + {stringarr}, + |e: vec[i8]| + result( + for(e, + appender[i8], + |c: appender[i8], j: i64, f: i8| + if(f > 96c && f < 123c, + merge(c, f - 32c), + merge(c, f)) + ) + ) + )""".format(stringarr=stringarr) + +@weld.lazy.weldfunc +def capitalize(stringarr): + """ + Capitalize first letter. + + """ + return """map( + {stringarr}, + |e: vec[i8]| + let lenString = len(e); + if(lenString > 0L, + let res = appender[i8]; + let firstChar = lookup(e, 0L); + let res = if(firstChar > 96c && firstChar < 123c, merge(res, firstChar - 32c), merge(res, firstChar)); + result( + for(slice(e, 1L, lenString - 1L), + res, + |c: appender[i8], j: i64, f: i8| + if(f > 64c && f < 91c, + merge(c, f + 32c), + merge(c, f) + ) + ) + ), + e) + )""".format(stringarr=stringarr) + + +@weld.lazy.weldfunc +def get(stringarr, i): + """ + Retrieves the character at index 'i'. + + If 'i' is greater than or equal to the string length, returns '\0'. + + """ + i = "i64({})".format(i) + return """map( + {stringarr}, + |e: vec[i8]| + let lenString = len(e); + if({i} >= lenString, + [0c], + if({i} > 0L, + result(merge(appender[i8], lookup(slice(e, 0L, lenString), {i}))), + if ({i} > -lenString, + result(merge(appender[i8], lookup(slice(e, lenString, {i}), {i}))), + [0c] + ) + ) + ) + )""".format(stringarr=stringarr, i=i) + + +@weld.lazy.weldfunc +def strip(stringarr): + """ + Strip whitespace from the start and end of each string. + + """ + # From https://en.wikipedia.org/wiki/Whitespace_character. 
+ is_whitespace = "((lookup(e, p) == 32c) || (lookup(e, p) >= 9c && lookup(e, p) <= 13c))" + # +3L = +1 compensate start_i already +1'ed, +1 compensate end_i already -1'ed, +1 compensate for slice with size + return """map( + {stringarr}, + |e: vec[i8]| + let lenString = len(e); + let start_i = iterate(0L, |p| {{p + 1L, p < lenString && {is_whitespace}}}); + let end_i = iterate(lenString - 1L, |p| {{p - 1L, p > start_i && {is_whitespace}}}); + slice(e, start_i - 1L, end_i - start_i + 3L) + )""".format(stringarr=stringarr, is_whitespace=is_whitespace) + +@weld.lazy.weldfunc +def contains(stringarr, pat): + """ + Check whether each element contains the substring 'pat', and returns + a boolean array of the results. + + For now, 'pat' must be a string literal. + """ + define_pat = "let pat = {};".format(string_to_weld_literal(pat)) + + return """ + {define_pat} + let lenPat = len(pat); + map({stringarr}, + |e: vec[i8]| + let lenString = len(e); + if(lenPat > lenString, + false, + # start by assuming pat is not found, until proven it is + let words_iter_res = iterate({{0L, false}}, + |p| + let e_i = p.$0; + let pat_i = 0L; + # start by assuming the substring and pat are the same, until proven otherwise + let word_check_res = iterate({{e_i, pat_i, true}}, + |q| + let found = lookup(e, q.$0) == lookup(pat, q.$1); + {{ + {{q.$0 + 1L, q.$1 + 1L, found}}, + q.$1 + 1L < lenPat && + found == true + }} + ).$2; + {{ + {{p.$0 + 1L, word_check_res}}, + p.$0 + lenPat < lenString && + word_check_res == false + }} + ).$1; + words_iter_res + ) + )""".format(stringarr=stringarr, define_pat=define_pat) + +@weld.lazy.weldfunc +def startswith(stringarr, pat): + """ + Check whether each element starts with the substring 'pat', and returns + a boolean array of the results. + + For now, 'pat' must be a string literal. 
+ + """ + define_pat = "let pat = {};".format(string_to_weld_literal(pat)) + + return """ + {define_pat} + let lenPat = len(pat); + map({stringarr}, + |e: vec[i8]| + let lenString = len(e); + if(lenPat > lenString, + false, + iterate({{0L, true}}, + |q| + let found = lookup(e, q.$0) == lookup(pat, q.$0); + {{ + {{q.$0 + 1L, found}}, + q.$0 + 1L < lenPat && + found == true + }} + ).$1 + ) + )""".format(stringarr=stringarr, define_pat=define_pat) + + +@weld.lazy.weldfunc +def endswith(stringarr, pat): + """ + Check whether each element ends with the substring 'pat', and returns + a boolean array of the results. + + For now, 'pat' must be a string literal. + + """ + define_pat = "let pat = {};".format(string_to_weld_literal(pat)) + + return """ + {define_pat} + let lenPat = len(pat); + map({stringarr}, + |e: vec[i8]| + let lenString = len(e); + if(lenPat > lenString, + false, + iterate({{lenString - lenPat, 0L, true}}, + |q| + let found = lookup(e, q.$0) == lookup(pat, q.$1); + {{ + {{q.$0 + 1L, q.$1 + 1L, found}}, + q.$1 + 1L < lenPat && + found == true + }} + ).$2 + ) + )""".format(stringarr=stringarr, define_pat=define_pat) + +@weld.lazy.weldfunc +def find(stringarr, sub, start, end=None): + """ + Searches for 'sub' in each string in the range 'start', 'end'. Returns a + i64 array with -1 for unfound strings, or the index of the found string. + + 'sub' must be a Python string. 'start' and 'end' must be integers. 
+ + """ + + start = "i64({})".format(start) + if end is None: + end = 'len(e)' + else: + end = "i64({})".format(end) + + define_sub = "let sub = {};".format(string_to_weld_literal(sub)) + + return """ + {define_sub} + let lenSub = len(sub); + map({stringarr}, + |e: vec[i8]| + let start = {start}; + let size = {end} - start; + if (start < 0L, + -1L, + let string = slice(e, start, size); + let lenString = len(string); + if(lenSub > lenString, + -1L, + # start by assuming sub is not found, until proven it is + let words_iter_res = iterate({{0L, false}}, + |p| + let e_i = p.$0; + let pat_i = 0L; + # start by assuming the substring and sub are the same, until proven otherwise + let word_check_res = iterate({{e_i, pat_i, true}}, + |q| + let found = lookup(string, q.$0) == lookup(sub, q.$1); + {{ + {{q.$0 + 1L, q.$1 + 1L, found}}, + q.$1 + 1L < lenSub && + found == true + }} + ).$2; + {{ + {{p.$0 + 1L, word_check_res}}, + p.$0 + lenSub < lenString && + word_check_res == false + }} + ); + if(words_iter_res.$1 == true, + words_iter_res.$0 - 1L + start, + -1L + ) + ) + ) + )""".format(stringarr=stringarr, define_sub=define_sub, start=start, end=end) + + +@weld.lazy.weldfunc +def replace(stringarr, pat, rep): + """ + Replace the first occurrence of 'pat' in each string with 'rep'. + + For now, 'pat' and 'rep' must be Python strings. 
+ + """ + define_pat = "let pat = {};".format(string_to_weld_literal(pat)) + define_rep = "let rep = {};".format(string_to_weld_literal(rep)) + + return """ + {define_pat} + {define_rep} + let lenPat = len(pat); + map({stringarr}, + |e: vec[i8]| + let lenString = len(e); + if(lenPat > lenString, + e, + # start by assuming sub is not found, until proven it is + let words_iter_res = iterate({{0L, false}}, + |p| + let e_i = p.$0; + let pat_i = 0L; + # start by assuming the substring and sub are the same, until proven otherwise + let word_check_res = iterate({{e_i, pat_i, true}}, + |q| + let found = lookup(e, q.$0) == lookup(pat, q.$1); + {{ + {{q.$0 + 1L, q.$1 + 1L, found}}, + q.$1 + 1L < lenPat && + found == true + }} + ).$2; + {{ + {{p.$0 + 1L, word_check_res}}, + p.$0 + lenPat < lenString && + word_check_res == false + }} + ); + if(words_iter_res.$1 == true, + let rep_from = words_iter_res.$0 - 1L; + let rep_to = rep_from + lenPat; + let res = appender[i8]; + let res = for(slice(e, 0L, rep_from), + res, + |c: appender[i8], j: i64, f: i8| + merge(c, f) + ); + let res = for(rep, + res, + |c: appender[i8], j: i64, f: i8| + merge(c, f) + ); + let res = for(slice(e, rep_to, lenString), + res, + |c: appender[i8], j: i64, f: i8| + merge(c, f) + ); + result(res), + e + ) + ) + )""".format(stringarr=stringarr, define_pat=define_pat, define_rep=define_rep) diff --git a/weld/src/lib.rs b/weld/src/lib.rs index cbfd717ff..4c267a1e6 100644 --- a/weld/src/lib.rs +++ b/weld/src/lib.rs @@ -713,7 +713,7 @@ impl WeldModule { use crate::sir::optimizations; info!("Applying SIR optimizations"); optimizations::fold_constants::fold_constants(&mut sir_prog)?; - optimizations::simplify_assignments::simplify_assignments(&mut sir_prog)?; + // optimizations::simplify_assignments::simplify_assignments(&mut sir_prog)?; } let end = PreciseTime::now(); debug!("Optimized SIR program:\n{}\n", &sir_prog);