From b781d94e8f7c85b32373ff33b77bb56cfc04c20b Mon Sep 17 00:00:00 2001 From: Shoumik Palkar Date: Tue, 17 Mar 2020 18:53:11 -0700 Subject: [PATCH 1/9] start string integration --- weld-python/weld/grizzly/series.py | 46 +++++++++++++++++++++++++----- 1 file changed, 39 insertions(+), 7 deletions(-) diff --git a/weld-python/weld/grizzly/series.py b/weld-python/weld/grizzly/series.py index fd7245358..db0080c73 100644 --- a/weld-python/weld/grizzly/series.py +++ b/weld-python/weld/grizzly/series.py @@ -8,11 +8,11 @@ import warnings import weld.encoders.numpy as wenp +import weld.grizzly.weld.str as weldstr from pandas.core.internals import SingleBlockManager from weld.lazy import PhysicalValue, WeldLazy, WeldNode, identity from weld.types import * - from .weld.ops import * from .error import * @@ -27,6 +27,23 @@ def _grizzlyseries_constructor_with_fallback(data=None, **kwargs): except TypeError: return pd.Series(data=data, **kwargs) +class StringMethods(object): + """ + String methods for Grizzly Series. + + """ + + def __init__(self, series): + # TODO(soumik.palkar): This probably needs to take an extension array, because + # Pandas doesn't like the 'S' dtype. + if series.dtype.char != 'S': + raise ValueError("StringMethods only available for Series with dtype 'S'") + self.series = series + + def lower(self): + lazy = weldstr.lower(self.weld_value_)(self.output_type, GrizzlySeries._decoder) + return GrizzlySeries(lazy, dtype=self.dtype) + class GrizzlySeries(pd.Series): """ A lazy `Series` object backed by a Weld computation. 
@@ -212,7 +229,10 @@ def _supports_grizzly(cls, data): """ if not isinstance(data, np.ndarray) or data.ndim != 1: return None - elem_type = wenp.dtype_to_weld_type(data.dtype) + if data.dtype.char == 'S': + elem_type = WeldVec(I8()) + else: + elem_type = wenp.dtype_to_weld_type(data.dtype) return WeldVec(elem_type) if elem_type is not None else None # ---------------------- Initialization ------------------------------ @@ -249,17 +269,21 @@ def __new__(cls, data, dtype=None, index=None, **kwargs): super(GrizzlySeries, self).__init__(None, dtype=dtype, **kwargs) self.weld_value_ = data return self - elif index is not None and not isinstance(index, pd.RangeIndex): + + if index is not None and not isinstance(index, pd.RangeIndex): # TODO(shoumik): This is probably incomplete, since we could have a # RangeIndex that does not capture the full span of the data, has a # non-zero step, etc. return pd.Series(data, dtype=dtype, index=index, **kwargs) - elif len(kwargs) != 0: + + if len(kwargs) != 0: + # Unsupported arguments present: bail for now. return pd.Series(data, dtype=dtype, index=index, **kwargs) - elif not isinstance(data, np.ndarray): + + if not isinstance(data, np.ndarray): # First, convert the input into a Series backed by an ndarray. - s = pd.Series(data, dtype=dtype, index=index, **kwargs) - data = s.values + s = pd.Series(data, dtype=dtype, index=index, **kwargs) + data = s.values # Try to create a Weld type for the input. weld_type = GrizzlySeries._supports_grizzly(data) @@ -270,9 +294,17 @@ def __new__(cls, data, dtype=None, index=None, **kwargs): PhysicalValue(data, weld_type, GrizzlySeries._encoder), GrizzlySeries._decoder) return self + # Don't re-convert values if we did it once already -- it's expensive. return s if s is not None else pd.Series(data, dtype=dtype, index=index, **kwargs) + # ---------------------- StringMethods ------------------------------ + + @property + def str(self): + # TODO(shoumik.palkar): Use pandas.core.accessor.CachedAccessor? 
+ return StringMethods(self) + # ---------------------- Indexing ------------------------------ def __setitem__(self, key, value): From 23802980a4ea50d1dd281f181cd15bf1d3186d52 Mon Sep 17 00:00:00 2001 From: Shoumik Palkar Date: Wed, 18 Mar 2020 16:44:08 -0700 Subject: [PATCH 2/9] more string stuff --- weld-python/weld/compile.py | 6 +-- weld-python/weld/encoders/numpy.py | 2 +- weld-python/weld/grizzly/series.py | 80 +++++++++++++++++++++++++--- weld-python/weld/grizzly/weld/str.py | 79 +++++++++++++++++++++++++++ 4 files changed, 155 insertions(+), 12 deletions(-) create mode 100644 weld-python/weld/grizzly/weld/str.py diff --git a/weld-python/weld/compile.py b/weld-python/weld/compile.py index b5bb436ba..8ef0586d0 100644 --- a/weld-python/weld/compile.py +++ b/weld-python/weld/compile.py @@ -81,7 +81,7 @@ def compile(program, arg_types, encoders, restype, decoder, conf=None): >>> from weld.types import * >>> func = compile("|x: i32| x + 1", ... [I32()], [None], - ... I32(), None) + ... I32(), None) ... >>> func(100)[0] 101 @@ -92,7 +92,7 @@ def compile(program, arg_types, encoders, restype, decoder, conf=None): >>> func = compile("|x: i32, y: i32| x + y", ... [I32(), I32()], [None, None], - ... I32(), None) + ... I32(), None) ... >>> func(5, 6)[0] 11 @@ -101,7 +101,7 @@ def compile(program, arg_types, encoders, restype, decoder, conf=None): >>> func = compile("|x: i32| x + 1", ... [I32()], [PrimitiveWeldEncoder()], - ... I32(), PrimitiveWeldDecoder()) + ... I32(), PrimitiveWeldDecoder()) ... 
>>> func(100)[0] 101 diff --git a/weld-python/weld/encoders/numpy.py b/weld-python/weld/encoders/numpy.py index 2f79593b6..23852f946 100644 --- a/weld-python/weld/encoders/numpy.py +++ b/weld-python/weld/encoders/numpy.py @@ -268,7 +268,7 @@ class StringConversionFuncs(object): """ stringfuncs = ctypes.PyDLL(weld.encoders._strings.__file__) - string_cclass = WeldVec(I8()).ctype_class + string_cclass = WeldVec(WeldVec(I8())).ctype_class @staticmethod def numpy_string_array_to_weld(arr): diff --git a/weld-python/weld/grizzly/series.py b/weld-python/weld/grizzly/series.py index db0080c73..143b01730 100644 --- a/weld-python/weld/grizzly/series.py +++ b/weld-python/weld/grizzly/series.py @@ -19,8 +19,7 @@ def _grizzlyseries_constructor_with_fallback(data=None, **kwargs): """ A flexible constructor for Series._constructor, which needs to be able - to fall back to a Series (if a certain operation does not produce - geometries) + to fall back to a Series (if a certain operation cannot produce GrizzlySeries). """ try: return GrizzlySeries(data=data, **kwargs) @@ -33,16 +32,75 @@ class StringMethods(object): """ + __slots__ = [ "series" ] + def __init__(self, series): - # TODO(soumik.palkar): This probably needs to take an extension array, because - # Pandas doesn't like the 'S' dtype. if series.dtype.char != 'S': raise ValueError("StringMethods only available for Series with dtype 'S'") self.series = series + def to_pandas(self): + """ + Convert a GrizzlySeries of strings to a Pandas series. + + We provide a specialized implementation of `to_pandas` here that will perform UTF-8 decoding + of the raw bytestrings that Grizzly series operate over. + + Examples + -------- + >>> x = GrizzlySeries(["Welcome", "to", "Grizzly!"]) + >>> x + 0 b'Welcome' + 1 b'to' + 2 b'Grizzly!' + dtype: bytes64 + >>> x.str.to_pandas() + 0 Welcome + 1 to + 2 Grizzly! 
+ dtype: object + + """ + return self.series.evaluate().to_pandas().str.decode("utf-8") + + def _apply(self, func): + """ + Apply the given weldfunc to `self.series` and return a new GrizzlySeries. + + """ + lazy = func(self.series.weld_value_)(self.series.output_type, GrizzlySeries._decoder) + return GrizzlySeries(lazy, dtype='S') + def lower(self): - lazy = weldstr.lower(self.weld_value_)(self.output_type, GrizzlySeries._decoder) - return GrizzlySeries(lazy, dtype=self.dtype) + """ + Lowercase strings in a GrizzlySeries. + + Examples + -------- + >>> x = GrizzlySeries(["HELLO", "WorLD"]) + >>> x.str.lower().str.to_pandas() + 0 hello + 1 world + dtype: object + + """ + return self._apply(weldstr.lower) + + def upper(self): + """ + Uppercase strings in a GrizzlySeries. + + Examples + -------- + >>> x = GrizzlySeries(["hello", "WorlD"]) + >>> x.str.upper().str.to_pandas() + 0 HELLO + 1 WORLD + dtype: object + + """ + return self._apply(weldstr.upper) + class GrizzlySeries(pd.Series): """ @@ -266,7 +324,7 @@ def __new__(cls, data, dtype=None, index=None, **kwargs): s = None if isinstance(data, WeldLazy): self = super(GrizzlySeries, cls).__new__(cls) - super(GrizzlySeries, self).__init__(None, dtype=dtype, **kwargs) + super(GrizzlySeries, self).__init__(np.array([], dtype=dtype), **kwargs) self.weld_value_ = data return self @@ -280,7 +338,13 @@ def __new__(cls, data, dtype=None, index=None, **kwargs): # Unsupported arguments present: bail for now. return pd.Series(data, dtype=dtype, index=index, **kwargs) - if not isinstance(data, np.ndarray): + if isinstance(data, list) and len(data) > 0 and isinstance(data[0], str): + # Try to convert a list of strings into a supported Numpy array. + data = np.array(data, dtype='S') + + if isinstance(data, pd.Series): + data = data.values + elif not isinstance(data, np.ndarray): # First, convert the input into a Series backed by an ndarray. 
s = pd.Series(data, dtype=dtype, index=index, **kwargs) data = s.values diff --git a/weld-python/weld/grizzly/weld/str.py b/weld-python/weld/grizzly/weld/str.py new file mode 100644 index 000000000..30025fe1e --- /dev/null +++ b/weld-python/weld/grizzly/weld/str.py @@ -0,0 +1,79 @@ +""" +String functions exported as weldfuncs. + +Each function takes an argument representing an array of strings, and outputs a +program that applies some transformation on each string. The functions are annotated +with `weld.lazy.weldfunc`, so they accept `WeldLazy` objects and return functions +for constructing Weld programs. + +""" + +import weld.lazy + +@weld.lazy.weldfunc +def lower(stringarr): + """ + Convert values to lowercase. + + """ + return """map( + {stringarr}, + |e: vec[i8]| + result( + for(e, + appender[i8], + |c: appender[i8], j: i64, f: i8| + if(f > 64c && f < 91c, + merge(c, f + 32c), + merge(c, f)) + ) + ) + )""".format(stringarr=stringarr) + + +@weld.lazy.weldfunc +def upper(stringarr): + """ + Convert values to uppercase. + + """ + return """map( + {stringarr}, + |e: vec[i8]| + result( + for(e, + appender[i8], + |c: appender[i8], j: i64, f: i8| + if(f > 96c && f < 123c, + merge(c, f - 32c), + merge(c, f)) + ) + ) + )""".format(stringarr=stringarr) + +@weld.lazy.weldfunc +def weld_str_capitalize(stringarr): + """ + Capitalize first letter. 
+ + """ + return """map( + {stringarr}, + |e: vec[i8]| + let lenString = len(e); + if(lenString > 0L, + let res = appender[i8]; + let firstChar = lookup(e, 0L); + let res = if(firstChar > 96c && firstChar < 123c, merge(res, firstChar - 32c), merge(res, firstChar)); + result( + for(slice(e, 1L, lenString - 1L), + res, + |c: appender[i8], j: i64, f: i8| + if(f > 64c && f < 91c, + merge(c, f + 32c), + merge(c, f) + ) + ) + ), + e) + )""".format(stringarr=stringarr) From f7ef0cd643d2809503bd3f8f139158e72b63772f Mon Sep 17 00:00:00 2001 From: Shoumik Palkar Date: Wed, 18 Mar 2020 18:37:35 -0700 Subject: [PATCH 3/9] add some string methods --- weld-python/weld/compile.py | 1 + weld-python/weld/grizzly/series.py | 63 +++++++++++++++++++++++++--- weld-python/weld/grizzly/weld/str.py | 48 ++++++++++++++++++++- weld/src/lib.rs | 2 +- 4 files changed, 107 insertions(+), 7 deletions(-) diff --git a/weld-python/weld/compile.py b/weld-python/weld/compile.py index 8ef0586d0..160048c79 100644 --- a/weld-python/weld/compile.py +++ b/weld-python/weld/compile.py @@ -158,6 +158,7 @@ def func(*args, context=None): raw_args_pointer = ctypes.addressof(raw_args) value = WeldValue(raw_args_pointer) + if context is None: context = WeldContext(conf) diff --git a/weld-python/weld/grizzly/series.py b/weld-python/weld/grizzly/series.py index 143b01730..0be2eb8c5 100644 --- a/weld-python/weld/grizzly/series.py +++ b/weld-python/weld/grizzly/series.py @@ -28,7 +28,9 @@ def _grizzlyseries_constructor_with_fallback(data=None, **kwargs): class StringMethods(object): """ - String methods for Grizzly Series. + String methods for Grizzly. Currently, string methods only apply to ASCII + strings; while users can pass UTF-8 strings into Grizzly, their codepoints + will be ignored by the below operations and will be returned unmodified. 
""" @@ -63,17 +65,17 @@ def to_pandas(self): """ return self.series.evaluate().to_pandas().str.decode("utf-8") - def _apply(self, func): + def _apply(self, func, *args): """ Apply the given weldfunc to `self.series` and return a new GrizzlySeries. """ - lazy = func(self.series.weld_value_)(self.series.output_type, GrizzlySeries._decoder) + lazy = func(self.series.weld_value_, *args)(self.series.output_type, GrizzlySeries._decoder) return GrizzlySeries(lazy, dtype='S') def lower(self): """ - Lowercase strings in a GrizzlySeries. + Lowercase strings. Examples -------- @@ -88,7 +90,7 @@ def lower(self): def upper(self): """ - Uppercase strings in a GrizzlySeries. + Uppercase strings. Examples -------- @@ -101,6 +103,57 @@ def upper(self): """ return self._apply(weldstr.upper) + def capitalize(self): + """ + Capitalize the first character in each string. + + Examples + -------- + >>> x = GrizzlySeries(["hello", "worlD"]) + >>> x.str.capitalize().str.to_pandas() + 0 Hello + 1 World + dtype: object + + """ + return self._apply(weldstr.capitalize) + + def get(self, index): + """ + Get the character at index 'i' from each string. If 'index' is greater than + the string length, this returns an empty string. If 'index' is less than 0, + this wraps around, using Python's indexing behavior. + + Examples + -------- + >>> x = GrizzlySeries(["hello", "worlD"]) + >>> x.str.get(4).str.to_pandas() + 0 o + 1 D + dtype: object + >>> x.str.get(-3).str.to_pandas() + 0 l + 1 r + dtype: object + + """ + return self._apply(weldstr.get, index) + + def strip(self): + """ + Strip whitespace from the string. 
+ + Examples + -------- + >>> x = GrizzlySeries([" hello ", " world \t "]) + >>> x.str.strip().str.to_pandas() + 0 hello + 1 world + dtype: object + + """ + return self._apply(weldstr.strip) + class GrizzlySeries(pd.Series): """ diff --git a/weld-python/weld/grizzly/weld/str.py b/weld-python/weld/grizzly/weld/str.py index 30025fe1e..46199ab84 100644 --- a/weld-python/weld/grizzly/weld/str.py +++ b/weld-python/weld/grizzly/weld/str.py @@ -52,7 +52,7 @@ def upper(stringarr): )""".format(stringarr=stringarr) @weld.lazy.weldfunc -def weld_str_capitalize(stringarr): +def capitalize(stringarr): """ Capitalize first letter. @@ -77,3 +77,49 @@ def weld_str_capitalize(stringarr): ), e) )""".format(stringarr=stringarr) + + +@weld.lazy.weldfunc +def get(stringarr, i): + """ + Retrieves the character at index 'i'. + + If 'i' is greater than the string length, returns '\0'. + + """ + i = "i64({})".format(i) + return """map( + {stringarr}, + |e: vec[i8]| + let lenString = len(e); + if({i} >= lenString, + [0c], + if({i} > 0L, + result(merge(appender[i8], lookup(slice(e, 0L, lenString), {i}))), + if ({i} > -lenString, + result(merge(appender[i8], lookup(slice(e, lenString, {i}), {i}))), + [0c] + ) + ) + ) + )""".format(stringarr=stringarr, i=i) + + +@weld.lazy.weldfunc +def strip(stringarr): + """ + Strip whitespace from the start of each string. + + """ + # From https://en.wikipedia.org/wiki/Whitespace_character. 
+ is_whitespace = "((lookup(e, p) == 32c) || (lookup(e, p) >= 9c && lookup(e, p) <= 13c))" + # +3L = +1 compensate start_i already +1'ed, +1 compensate end_i already -1'ed, +1 compensate for slice with size + return """map( + {stringarr}, + |e: vec[i8]| + let lenString = len(e); + let start_i = iterate(0L, |p| {{p + 1L, p < lenString && {is_whitespace}}}); + let end_i = iterate(lenString - 1L, |p| {{p - 1L, p > start_i && {is_whitespace}}}); + # slice(e, start_i - 1L, lenString - start_i + 1L) + slice(e, start_i - 1L, end_i - start_i + 3L) + )""".format(stringarr=stringarr, is_whitespace=is_whitespace) diff --git a/weld/src/lib.rs b/weld/src/lib.rs index cbfd717ff..4c267a1e6 100644 --- a/weld/src/lib.rs +++ b/weld/src/lib.rs @@ -713,7 +713,7 @@ impl WeldModule { use crate::sir::optimizations; info!("Applying SIR optimizations"); optimizations::fold_constants::fold_constants(&mut sir_prog)?; - optimizations::simplify_assignments::simplify_assignments(&mut sir_prog)?; + // optimizations::simplify_assignments::simplify_assignments(&mut sir_prog)?; } let end = PreciseTime::now(); debug!("Optimized SIR program:\n{}\n", &sir_prog); From 67081c6a42fd8f1fff1061b4d02b7e853f38be5d Mon Sep 17 00:00:00 2001 From: Shoumik Palkar Date: Thu, 19 Mar 2020 20:55:36 -0700 Subject: [PATCH 4/9] add contains implementation --- weld-python/weld/grizzly/series.py | 34 ++++++++++++++--- weld-python/weld/grizzly/weld/str.py | 56 +++++++++++++++++++++++++++- 2 files changed, 84 insertions(+), 6 deletions(-) diff --git a/weld-python/weld/grizzly/series.py b/weld-python/weld/grizzly/series.py index 0be2eb8c5..1d1f9b019 100644 --- a/weld-python/weld/grizzly/series.py +++ b/weld-python/weld/grizzly/series.py @@ -65,13 +65,18 @@ def to_pandas(self): """ return self.series.evaluate().to_pandas().str.decode("utf-8") - def _apply(self, func, *args): + def _apply(self, func, *args, return_weld_elem_type=None): """ Apply the given weldfunc to `self.series` and return a new GrizzlySeries. 
+ If the return type of the result is not a string GrizzlySeries, pass + 'return_weld_elem_type' to specify the element type of the result. + """ - lazy = func(self.series.weld_value_, *args)(self.series.output_type, GrizzlySeries._decoder) - return GrizzlySeries(lazy, dtype='S') + output_type = self.series.output_type if return_weld_elem_type is None else WeldVec(return_weld_elem_type) + dtype = 'S' if return_weld_elem_type is None else wenp.weld_type_to_dtype(return_weld_elem_type) + lazy = func(self.series.weld_value_, *args)(output_type, GrizzlySeries._decoder) + return GrizzlySeries(lazy, dtype=dtype) def lower(self): """ @@ -154,6 +159,25 @@ def strip(self): """ return self._apply(weldstr.strip) + def contains(self, pat): + """ + Returns whether each string contains the provided pattern. + + Pattern must be a Python string. + + Examples + -------- + >>> x = GrizzlySeries(["hello", "world"]) + >>> x.str.contains('wor').evaluate() + 0 False + 1 True + dtype: bool + + """ + if not isinstance(pat, str): + raise TypeError("pattern in contains must be a Python 'str'") + return self._apply(weldstr.contains, pat, return_weld_elem_type=Bool()) + class GrizzlySeries(pd.Series): """ @@ -367,9 +391,9 @@ def __new__(cls, data, dtype=None, index=None, **kwargs): >>> x = GrizzlySeries(np.ones(5)) >>> x.__class__ - >>> y = GrizzlySeries(['hi', 'bye']) # Unsupported + >>> y = GrizzlySeries(['hi', 'bye']) >>> y.__class__ - + >>> y = GrizzlySeries([1, 2, 3], index=[1, 0, 2]) # Unsupported >>> y.__class__ diff --git a/weld-python/weld/grizzly/weld/str.py b/weld-python/weld/grizzly/weld/str.py index 46199ab84..c1bbb4194 100644 --- a/weld-python/weld/grizzly/weld/str.py +++ b/weld-python/weld/grizzly/weld/str.py @@ -10,6 +10,18 @@ import weld.lazy +def string_to_weld_literal(s): + """ + Converts a string to a UTF-8 encoded Weld literal byte-vector. 
+ + Examples + -------- + >>> string_to_weld_literal('hello') + '[104c,101c,108c,108c,111c]' + + """ + return "[" + ",".join([str(b) + 'c' for b in list(s.encode('utf-8'))]) + "]" + @weld.lazy.weldfunc def lower(stringarr): """ @@ -120,6 +132,48 @@ def strip(stringarr): let lenString = len(e); let start_i = iterate(0L, |p| {{p + 1L, p < lenString && {is_whitespace}}}); let end_i = iterate(lenString - 1L, |p| {{p - 1L, p > start_i && {is_whitespace}}}); - # slice(e, start_i - 1L, lenString - start_i + 1L) slice(e, start_i - 1L, end_i - start_i + 3L) )""".format(stringarr=stringarr, is_whitespace=is_whitespace) + +@weld.lazy.weldfunc +def contains(stringarr, pat): + """ + Check whether each element contains the substring 'pat', and returns + a boolean array of the results. + + For now, 'pat' must be a string literal. + """ + define_pat = "let pat = {};".format(string_to_weld_literal(pat)) + + return """ + {define_pat} + let lenPat = len(pat); + map({stringarr}, + |e: vec[i8]| + let lenString = len(e); + if(lenPat > lenString, + false, + # start by assuming pat is not found, until proven it is + let words_iter_res = iterate({{0L, false}}, + |p| + let e_i = p.$0; + let pat_i = 0L; + # start by assuming the substring and pat are the same, until proven otherwise + let word_check_res = iterate({{e_i, pat_i, true}}, + |q| + let found = lookup(e, q.$0) == lookup(pat, q.$1); + {{ + {{q.$0 + 1L, q.$1 + 1L, found}}, + q.$1 + 1L < lenPat && + found == true + }} + ).$2; + {{ + {{p.$0 + 1L, word_check_res}}, + p.$0 + lenPat < lenString && + word_check_res == false + }} + ).$1; + words_iter_res + ) + )""".format(stringarr=stringarr, define_pat=define_pat) From 4346ad37c6561310c78717a23072f6426287d984 Mon Sep 17 00:00:00 2001 From: Shoumik Palkar Date: Fri, 20 Mar 2020 17:02:25 -0700 Subject: [PATCH 5/9] more string functions --- weld-python/weld/grizzly/series.py | 87 ++++++++++++ weld-python/weld/grizzly/weld/str.py | 196 +++++++++++++++++++++++++++ 2 files changed, 283 
insertions(+) diff --git a/weld-python/weld/grizzly/series.py b/weld-python/weld/grizzly/series.py index 1d1f9b019..3c4b69207 100644 --- a/weld-python/weld/grizzly/series.py +++ b/weld-python/weld/grizzly/series.py @@ -178,6 +178,93 @@ def contains(self, pat): raise TypeError("pattern in contains must be a Python 'str'") return self._apply(weldstr.contains, pat, return_weld_elem_type=Bool()) + def startswith(self, pat): + """ + Returns whether each string starts with the provided pattern. + + Pattern must be a Python string. + + Examples + -------- + >>> x = GrizzlySeries(["hello", "world"]) + >>> x.str.startswith('wo').evaluate() + 0 False + 1 True + dtype: bool + + """ + if not isinstance(pat, str): + raise TypeError("pattern in startswith must be a Python 'str'") + return self._apply(weldstr.startswith, pat, return_weld_elem_type=Bool()) + + def endswith(self, pat): + """ + Returns whether each string starts with the provided pattern. + + Pattern must be a Python string. + + Examples + -------- + >>> x = GrizzlySeries(["hello", "world"]) + >>> x.str.endswith('rld').evaluate() + 0 False + 1 True + dtype: bool + + """ + if not isinstance(pat, str): + raise TypeError("pattern in endswith must be a Python 'str'") + return self._apply(weldstr.endswith, pat, return_weld_elem_type=Bool()) + + def find(self, sub, start=0, end=None): + """ + Find 'sub' in each string. Each string is searched in the range [start,end]. + + 'sub' must be a Python string, and 'start' and 'end' must be Python integers. 
+ + Examples + -------- + >>> x = GrizzlySeries(["bigfatcat", "fatcatbig", "reallybigcat"]) + >>> x.str.find('fat').evaluate() + 0 3 + 1 0 + 2 -1 + dtype: int64 + >>> x.str.find('big', end=2).evaluate() + 0 0 + 1 -1 + 2 -1 + dtype: int64 + + """ + if not isinstance(sub, str): + raise TypeError("sub in find must be a Python 'str'") + if not isinstance(start, int): + raise TypeError("start in find must be a Python 'int'") + if end is not None and not isinstance(end, int): + raise TypeError("end in find must be a Python 'int'") + return self._apply(weldstr.find, sub, start, end, return_weld_elem_type=I64()) + + def replace(self, pat, rep): + """ + Replaces the first occurrence of 'pat' with 'rep' in each string. + + Pattern and replacement must be Python strings. + + Examples + -------- + >>> x = GrizzlySeries(["hello", "world"]) + >>> x.str.replace('o', 'lalala').str.to_pandas() + 0 helllalala + 1 wlalalarld + dtype: object + + """ + if not isinstance(pat, str): + raise TypeError("pattern in replace must be a Python 'str'") + if not isinstance(rep, str): + raise TypeError("replacement in replace must be a Python 'str'") + return self._apply(weldstr.replace, pat, rep) class GrizzlySeries(pd.Series): """ diff --git a/weld-python/weld/grizzly/weld/str.py b/weld-python/weld/grizzly/weld/str.py index c1bbb4194..0565d0052 100644 --- a/weld-python/weld/grizzly/weld/str.py +++ b/weld-python/weld/grizzly/weld/str.py @@ -6,6 +6,11 @@ with `weld.lazy.weldfunc`, so they accept `WeldLazy` objects and return functions for constructing Weld programs. +These are adapted from +https://github.com/weld-project/baloo/blob/master/baloo/weld/weld_str.py. + +We may choose to re-implement these as UDF calls to Rust's UTF-8 library in the future. 
+ """ import weld.lazy @@ -177,3 +182,194 @@ def contains(stringarr, pat): words_iter_res ) )""".format(stringarr=stringarr, define_pat=define_pat) + +@weld.lazy.weldfunc +def startswith(stringarr, pat): + """ + Check whether each element starts with the substring 'pat', and returns + a boolean array of the results. + + For now, 'pat' must be a string literal. + + """ + define_pat = "let pat = {};".format(string_to_weld_literal(pat)) + + return """ + {define_pat} + let lenPat = len(pat); + map({stringarr}, + |e: vec[i8]| + let lenString = len(e); + if(lenPat > lenString, + false, + iterate({{0L, true}}, + |q| + let found = lookup(e, q.$0) == lookup(pat, q.$0); + {{ + {{q.$0 + 1L, found}}, + q.$0 + 1L < lenPat && + found == true + }} + ).$1 + ) + )""".format(stringarr=stringarr, define_pat=define_pat) + + +@weld.lazy.weldfunc +def endswith(stringarr, pat): + """ + Check whether each element ends with the substring 'pat', and returns + a boolean array of the results. + + For now, 'pat' must be a string literal. + + """ + define_pat = "let pat = {};".format(string_to_weld_literal(pat)) + + return """ + {define_pat} + let lenPat = len(pat); + map({stringarr}, + |e: vec[i8]| + let lenString = len(e); + if(lenPat > lenString, + false, + iterate({{lenString - lenPat, 0L, true}}, + |q| + let found = lookup(e, q.$0) == lookup(pat, q.$1); + {{ + {{q.$0 + 1L, q.$1 + 1L, found}}, + q.$1 + 1L < lenPat && + found == true + }} + ).$2 + ) + )""".format(stringarr=stringarr, define_pat=define_pat) + +@weld.lazy.weldfunc +def find(stringarr, sub, start, end=None): + """ + Searches for 'sub' in each string in the range 'start', 'end'. Returns a + i64 array with -1 for unfound strings, or the index of the found string. + + 'sub' must be a Python string. 'start' and 'end' must be integers. + + """ + + start = "i64({})".format(start) + if end is None: + # -1L since we add 1L below to make the range inclusive in the size. 
+ end = 'len(e) - 1L' + else: + end = "i64({})".format(end) + + define_sub = "let sub = {};".format(string_to_weld_literal(sub)) + + return """ + {define_sub} + let lenSub = len(sub); + map({stringarr}, + |e: vec[i8]| + let start = {start}; + let size = {end} - start + 1L; # Search [start:end], so add 1L. + if (start < 0L || start + size > len(e), + -1L, + let string = slice(e, start, size); + let lenString = len(string); + if(lenSub > lenString, + -1L, + # start by assuming sub is not found, until proven it is + let words_iter_res = iterate({{0L, false}}, + |p| + let e_i = p.$0; + let pat_i = 0L; + # start by assuming the substring and sub are the same, until proven otherwise + let word_check_res = iterate({{e_i, pat_i, true}}, + |q| + let found = lookup(string, q.$0) == lookup(sub, q.$1); + {{ + {{q.$0 + 1L, q.$1 + 1L, found}}, + q.$1 + 1L < lenSub && + found == true + }} + ).$2; + {{ + {{p.$0 + 1L, word_check_res}}, + p.$0 + lenSub < lenString && + word_check_res == false + }} + ); + if(words_iter_res.$1 == true, + words_iter_res.$0 - 1L + start, + -1L + ) + ) + ) + )""".format(stringarr=stringarr, define_sub=define_sub, start=start, end=end) + + +@weld.lazy.weldfunc +def replace(stringarr, pat, rep): + """ + Replace the first occurrence iof 'pat' in each string with with 'rep'. + + For now, 'pat' and 'rep' must be Python strings. 
+ + """ + define_pat = "let pat = {};".format(string_to_weld_literal(pat)) + define_rep = "let rep = {};".format(string_to_weld_literal(rep)) + + return """ + {define_pat} + {define_rep} + let lenPat = len(pat); + map({stringarr}, + |e: vec[i8]| + let lenString = len(e); + if(lenPat > lenString, + e, + # start by assuming sub is not found, until proven it is + let words_iter_res = iterate({{0L, false}}, + |p| + let e_i = p.$0; + let pat_i = 0L; + # start by assuming the substring and sub are the same, until proven otherwise + let word_check_res = iterate({{e_i, pat_i, true}}, + |q| + let found = lookup(e, q.$0) == lookup(pat, q.$1); + {{ + {{q.$0 + 1L, q.$1 + 1L, found}}, + q.$1 + 1L < lenPat && + found == true + }} + ).$2; + {{ + {{p.$0 + 1L, word_check_res}}, + p.$0 + lenPat < lenString && + word_check_res == false + }} + ); + if(words_iter_res.$1 == true, + let rep_from = words_iter_res.$0 - 1L; + let rep_to = rep_from + lenPat; + let res = appender[i8]; + let res = for(slice(e, 0L, rep_from), + res, + |c: appender[i8], j: i64, f: i8| + merge(c, f) + ); + let res = for(rep, + res, + |c: appender[i8], j: i64, f: i8| + merge(c, f) + ); + let res = for(slice(e, rep_to, lenString), + res, + |c: appender[i8], j: i64, f: i8| + merge(c, f) + ); + result(res), + e + ) + ) + )""".format(stringarr=stringarr, define_pat=define_pat, define_rep=define_rep) From 3832175ae76a600b94ae6c3cb51d84ecd5d41f20 Mon Sep 17 00:00:00 2001 From: Shoumik Palkar Date: Fri, 20 Mar 2020 17:31:22 -0700 Subject: [PATCH 6/9] move code around, add some more string functions --- weld-python/tests/grizzly/test_series.py | 4 +- weld-python/weld/grizzly/__init__.py | 2 + weld-python/weld/grizzly/core/__init__.py | 1 + weld-python/weld/grizzly/{ => core}/error.py | 0 weld-python/weld/grizzly/{ => core}/series.py | 254 +---------------- weld-python/weld/grizzly/core/strings.py | 263 ++++++++++++++++++ 6 files changed, 275 insertions(+), 249 deletions(-) create mode 100644 
weld-python/weld/grizzly/core/__init__.py rename weld-python/weld/grizzly/{ => core}/error.py (100%) rename weld-python/weld/grizzly/{ => core}/series.py (74%) create mode 100644 weld-python/weld/grizzly/core/strings.py diff --git a/weld-python/tests/grizzly/test_series.py b/weld-python/tests/grizzly/test_series.py index 3cece7fba..351dfc8ee 100644 --- a/weld-python/tests/grizzly/test_series.py +++ b/weld-python/tests/grizzly/test_series.py @@ -6,7 +6,7 @@ import numpy as np import pandas as pd import pytest -import weld.grizzly.series as gr +import weld.grizzly as gr types_ = ['int8', 'uint8', 'int16', 'uint16', 'int32',\ 'uint32', 'int64', 'uint64', 'float32', 'float64'] @@ -134,7 +134,7 @@ def test_indexing(): def test_unsupported_binop_error(): # Test unsupported - from weld.grizzly.error import GrizzlyError + from weld.grizzly.core.error import GrizzlyError with pytest.raises(GrizzlyError): a = gr.GrizzlySeries([1,2,3]) b = pd.Series([1,2,3]) diff --git a/weld-python/weld/grizzly/__init__.py b/weld-python/weld/grizzly/__init__.py index e69de29bb..963902a18 100644 --- a/weld-python/weld/grizzly/__init__.py +++ b/weld-python/weld/grizzly/__init__.py @@ -0,0 +1,2 @@ + +from weld.grizzly.core.series import GrizzlySeries diff --git a/weld-python/weld/grizzly/core/__init__.py b/weld-python/weld/grizzly/core/__init__.py new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/weld-python/weld/grizzly/core/__init__.py @@ -0,0 +1 @@ + diff --git a/weld-python/weld/grizzly/error.py b/weld-python/weld/grizzly/core/error.py similarity index 100% rename from weld-python/weld/grizzly/error.py rename to weld-python/weld/grizzly/core/error.py diff --git a/weld-python/weld/grizzly/series.py b/weld-python/weld/grizzly/core/series.py similarity index 74% rename from weld-python/weld/grizzly/series.py rename to weld-python/weld/grizzly/core/series.py index 3c4b69207..0b5b2b124 100644 --- a/weld-python/weld/grizzly/series.py +++ b/weld-python/weld/grizzly/core/series.py 
@@ -10,11 +10,11 @@ import weld.encoders.numpy as wenp import weld.grizzly.weld.str as weldstr -from pandas.core.internals import SingleBlockManager from weld.lazy import PhysicalValue, WeldLazy, WeldNode, identity +from weld.grizzly.weld.ops import * +from weld.grizzly.core.error import GrizzlyError +from weld.grizzly.core.strings import StringMethods from weld.types import * -from .weld.ops import * -from .error import * def _grizzlyseries_constructor_with_fallback(data=None, **kwargs): """ @@ -26,246 +26,6 @@ def _grizzlyseries_constructor_with_fallback(data=None, **kwargs): except TypeError: return pd.Series(data=data, **kwargs) -class StringMethods(object): - """ - String methods for Grizzly. Currently, string methods only apply to ASCII - strings; while users can pass UTF-8 strings into Grizzly, their codepoints - will be ignored by the below operations and will be returned unmodified. - - """ - - __slots__ = [ "series" ] - - def __init__(self, series): - if series.dtype.char != 'S': - raise ValueError("StringMethods only available for Series with dtype 'S'") - self.series = series - - def to_pandas(self): - """ - Convert a GrizzlySeries of strings to a Pandas series. - - We provide a specialized implementation of `to_pandas` here that will perform UTF-8 decoding - of the raw bytestrings that Grizzly series operate over. - - Examples - -------- - >>> x = GrizzlySeries(["Welcome", "to", "Grizzly!"]) - >>> x - 0 b'Welcome' - 1 b'to' - 2 b'Grizzly!' - dtype: bytes64 - >>> x.str.to_pandas() - 0 Welcome - 1 to - 2 Grizzly! - dtype: object - - """ - return self.series.evaluate().to_pandas().str.decode("utf-8") - - def _apply(self, func, *args, return_weld_elem_type=None): - """ - Apply the given weldfunc to `self.series` and return a new GrizzlySeries. - - If the return type of the result is not a string GrizzlySeries, pass - 'return_weld_elem_type' to specify the element type of the result. 
- - """ - output_type = self.series.output_type if return_weld_elem_type is None else WeldVec(return_weld_elem_type) - dtype = 'S' if return_weld_elem_type is None else wenp.weld_type_to_dtype(return_weld_elem_type) - lazy = func(self.series.weld_value_, *args)(output_type, GrizzlySeries._decoder) - return GrizzlySeries(lazy, dtype=dtype) - - def lower(self): - """ - Lowercase strings. - - Examples - -------- - >>> x = GrizzlySeries(["HELLO", "WorLD"]) - >>> x.str.lower().str.to_pandas() - 0 hello - 1 world - dtype: object - - """ - return self._apply(weldstr.lower) - - def upper(self): - """ - Uppercase strings. - - Examples - -------- - >>> x = GrizzlySeries(["hello", "WorlD"]) - >>> x.str.upper().str.to_pandas() - 0 HELLO - 1 WORLD - dtype: object - - """ - return self._apply(weldstr.upper) - - def capitalize(self): - """ - Capitalize the first character in each string. - - Examples - -------- - >>> x = GrizzlySeries(["hello", "worlD"]) - >>> x.str.capitalize().str.to_pandas() - 0 Hello - 1 World - dtype: object - - """ - return self._apply(weldstr.capitalize) - - def get(self, index): - """ - Get the character at index 'i' from each string. If 'index' is greater than - the string length, this returns an empty string. If 'index' is less than 0, - this wraps around, using Python's indexing behavior. - - Examples - -------- - >>> x = GrizzlySeries(["hello", "worlD"]) - >>> x.str.get(4).str.to_pandas() - 0 o - 1 D - dtype: object - >>> x.str.get(-3).str.to_pandas() - 0 l - 1 r - dtype: object - - """ - return self._apply(weldstr.get, index) - - def strip(self): - """ - Strip whitespace from the string. - - Examples - -------- - >>> x = GrizzlySeries([" hello ", " world \t "]) - >>> x.str.strip().str.to_pandas() - 0 hello - 1 world - dtype: object - - """ - return self._apply(weldstr.strip) - - def contains(self, pat): - """ - Returns whether each string contains the provided pattern. - - Pattern must be a Python string. 
- - Examples - -------- - >>> x = GrizzlySeries(["hello", "world"]) - >>> x.str.contains('wor').evaluate() - 0 False - 1 True - dtype: bool - - """ - if not isinstance(pat, str): - raise TypeError("pattern in contains must be a Python 'str'") - return self._apply(weldstr.contains, pat, return_weld_elem_type=Bool()) - - def startswith(self, pat): - """ - Returns whether each string starts with the provided pattern. - - Pattern must be a Python string. - - Examples - -------- - >>> x = GrizzlySeries(["hello", "world"]) - >>> x.str.startswith('wo').evaluate() - 0 False - 1 True - dtype: bool - - """ - if not isinstance(pat, str): - raise TypeError("pattern in startswith must be a Python 'str'") - return self._apply(weldstr.startswith, pat, return_weld_elem_type=Bool()) - - def endswith(self, pat): - """ - Returns whether each string starts with the provided pattern. - - Pattern must be a Python string. - - Examples - -------- - >>> x = GrizzlySeries(["hello", "world"]) - >>> x.str.endswith('rld').evaluate() - 0 False - 1 True - dtype: bool - - """ - if not isinstance(pat, str): - raise TypeError("pattern in endswith must be a Python 'str'") - return self._apply(weldstr.endswith, pat, return_weld_elem_type=Bool()) - - def find(self, sub, start=0, end=None): - """ - Find 'sub' in each string. Each string is searched in the range [start,end]. - - 'sub' must be a Python string, and 'start' and 'end' must be Python integers. 
- - Examples - -------- - >>> x = GrizzlySeries(["bigfatcat", "fatcatbig", "reallybigcat"]) - >>> x.str.find('fat').evaluate() - 0 3 - 1 0 - 2 -1 - dtype: int64 - >>> x.str.find('big', end=2).evaluate() - 0 0 - 1 -1 - 2 -1 - dtype: int64 - - """ - if not isinstance(sub, str): - raise TypeError("sub in find must be a Python 'str'") - if not isinstance(start, int): - raise TypeError("start in find must be a Python 'int'") - if end is not None and not isinstance(end, int): - raise TypeError("end in find must be a Python 'int'") - return self._apply(weldstr.find, sub, start, end, return_weld_elem_type=I64()) - - def replace(self, pat, rep): - """ - Replaces the first occurrence of 'pat' with 'rep' in each string. - - Pattern and replacement must be Python strings. - - Examples - -------- - >>> x = GrizzlySeries(["hello", "world"]) - >>> x.str.replace('o', 'lalala').str.to_pandas() - 0 helllalala - 1 wlalalarld - dtype: object - - """ - if not isinstance(pat, str): - raise TypeError("pattern in replace must be a Python 'str'") - if not isinstance(rep, str): - raise TypeError("replacement in replace must be a Python 'str'") - return self._apply(weldstr.replace, pat, rep) - class GrizzlySeries(pd.Series): """ A lazy `Series` object backed by a Weld computation. @@ -353,7 +113,7 @@ def values(self): >>> x.values Traceback (most recent call last): ... - weld.grizzly.error.GrizzlyError: GrizzlySeries is not evaluated and does not have values. Try calling 'evaluate()' first. + weld.grizzly.core.error.GrizzlyError: GrizzlySeries is not evaluated and does not have values. Try calling 'evaluate()' first. """ if not self.is_value: raise GrizzlyError("GrizzlySeries is not evaluated and does not have values. 
Try calling 'evaluate()' first.") @@ -474,13 +234,13 @@ def __new__(cls, data, dtype=None, index=None, **kwargs): 2 3 dtype: int64 >>> x.__class__ - + >>> x = GrizzlySeries(np.ones(5)) >>> x.__class__ - + >>> y = GrizzlySeries(['hi', 'bye']) >>> y.__class__ - + >>> y = GrizzlySeries([1, 2, 3], index=[1, 0, 2]) # Unsupported >>> y.__class__ diff --git a/weld-python/weld/grizzly/core/strings.py b/weld-python/weld/grizzly/core/strings.py new file mode 100644 index 000000000..6db4162ee --- /dev/null +++ b/weld-python/weld/grizzly/core/strings.py @@ -0,0 +1,263 @@ +""" +String methods supported by Series. + +""" + +import weld.encoders.numpy as wenp +import weld.grizzly.weld.str as weldstr + +from weld.types import * + +class StringMethods(object): + """ + String methods for Grizzly. Currently, string methods only apply to ASCII + strings; while users can pass UTF-8 strings into Grizzly, their codepoints + will be ignored by the below operations and will be returned unmodified. + + """ + + __slots__ = [ "series", "constructor" ] + + def __init__(self, series): + if series.dtype.char != 'S': + raise ValueError("StringMethods only available for Series with dtype 'S'") + self.series = series + # TODO(shoumik): This is a hack: we should define an abstract class that captures + # the interface additional functionality needs. + self.constructor = self.series.__class__ + + def to_pandas(self): + """ + Convert an array of strings to a Pandas series. + + We provide a specialized implementation of `to_pandas` here that will perform UTF-8 decoding + of the raw bytestrings that Grizzly series operate over. + + Examples + -------- + >>> from weld.grizzly import GrizzlySeries + >>> x = GrizzlySeries(["Welcome", "to", "Grizzly!"]) + >>> x + 0 b'Welcome' + 1 b'to' + 2 b'Grizzly!' + dtype: bytes64 + >>> x.str.to_pandas() + 0 Welcome + 1 to + 2 Grizzly! 
+ dtype: object + + """ + return self.series.evaluate().to_pandas().str.decode("utf-8") + + def _apply(self, func, *args, return_weld_elem_type=None): + """ + Apply the given weldfunc to `self.series` and return a new GrizzlySeries. + + If the return type of the result is not a string GrizzlySeries, pass + 'return_weld_elem_type' to specify the element type of the result. + + """ + output_type = self.series.output_type if return_weld_elem_type is None else WeldVec(return_weld_elem_type) + dtype = 'S' if return_weld_elem_type is None else wenp.weld_type_to_dtype(return_weld_elem_type) + lazy = func(self.series.weld_value_, *args)(output_type, self.constructor._decoder) + return (self.constructor)(lazy, dtype=dtype) + + def lower(self): + """ + Lowercase strings. + + Examples + -------- + >>> from weld.grizzly import GrizzlySeries + >>> x = GrizzlySeries(["HELLO", "WorLD"]) + >>> x.str.lower().str.to_pandas() + 0 hello + 1 world + dtype: object + + """ + return self._apply(weldstr.lower) + + def upper(self): + """ + Uppercase strings. + + Examples + -------- + >>> from weld.grizzly import GrizzlySeries + >>> x = GrizzlySeries(["hello", "WorlD"]) + >>> x.str.upper().str.to_pandas() + 0 HELLO + 1 WORLD + dtype: object + + """ + return self._apply(weldstr.upper) + + def capitalize(self): + """ + Capitalize the first character in each string. + + Examples + -------- + >>> from weld.grizzly import GrizzlySeries + >>> x = GrizzlySeries(["hello", "worlD"]) + >>> x.str.capitalize().str.to_pandas() + 0 Hello + 1 World + dtype: object + + """ + return self._apply(weldstr.capitalize) + + def get(self, index): + """ + Get the character at index 'i' from each string. If 'index' is greater than + the string length, this returns an empty string. If 'index' is less than 0, + this wraps around, using Python's indexing behavior. 
+ + Examples + -------- + >>> from weld.grizzly import GrizzlySeries + >>> x = GrizzlySeries(["hello", "worlD"]) + >>> x.str.get(4).str.to_pandas() + 0 o + 1 D + dtype: object + >>> x.str.get(-3).str.to_pandas() + 0 l + 1 r + dtype: object + + """ + return self._apply(weldstr.get, index) + + def strip(self): + """ + Strip whitespace from the string. + + Examples + -------- + >>> from weld.grizzly import GrizzlySeries + >>> x = GrizzlySeries([" hello ", " world \t "]) + >>> x.str.strip().str.to_pandas() + 0 hello + 1 world + dtype: object + + """ + return self._apply(weldstr.strip) + + def contains(self, pat): + """ + Returns whether each string contains the provided pattern. + + Pattern must be a Python string. + + Examples + -------- + >>> from weld.grizzly import GrizzlySeries + >>> x = GrizzlySeries(["hello", "world"]) + >>> x.str.contains('wor').evaluate() + 0 False + 1 True + dtype: bool + + """ + if not isinstance(pat, str): + raise TypeError("pattern in contains must be a Python 'str'") + return self._apply(weldstr.contains, pat, return_weld_elem_type=Bool()) + + def startswith(self, pat): + """ + Returns whether each string starts with the provided pattern. + + Pattern must be a Python string. + + Examples + -------- + >>> from weld.grizzly import GrizzlySeries + >>> x = GrizzlySeries(["hello", "world"]) + >>> x.str.startswith('wo').evaluate() + 0 False + 1 True + dtype: bool + + """ + if not isinstance(pat, str): + raise TypeError("pattern in startswith must be a Python 'str'") + return self._apply(weldstr.startswith, pat, return_weld_elem_type=Bool()) + + def endswith(self, pat): + """ + Returns whether each string ends with the provided pattern. + + Pattern must be a Python string.
+ + Examples + -------- + >>> from weld.grizzly import GrizzlySeries + >>> x = GrizzlySeries(["hello", "world"]) + >>> x.str.endswith('rld').evaluate() + 0 False + 1 True + dtype: bool + + """ + if not isinstance(pat, str): + raise TypeError("pattern in endswith must be a Python 'str'") + return self._apply(weldstr.endswith, pat, return_weld_elem_type=Bool()) + + def find(self, sub, start=0, end=None): + """ + Find 'sub' in each string. Each string is searched in the range [start,end]. + + 'sub' must be a Python string, and 'start' and 'end' must be Python integers. + + Examples + -------- + >>> from weld.grizzly import GrizzlySeries + >>> x = GrizzlySeries(["bigfatcat", "fatcatbig", "reallybigcat"]) + >>> x.str.find('fat').evaluate() + 0 3 + 1 0 + 2 -1 + dtype: int64 + >>> x.str.find('big', end=2).evaluate() + 0 0 + 1 -1 + 2 -1 + dtype: int64 + + """ + if not isinstance(sub, str): + raise TypeError("sub in find must be a Python 'str'") + if not isinstance(start, int): + raise TypeError("start in find must be a Python 'int'") + if end is not None and not isinstance(end, int): + raise TypeError("end in find must be a Python 'int'") + return self._apply(weldstr.find, sub, start, end, return_weld_elem_type=I64()) + + def replace(self, pat, rep): + """ + Replaces the first occurrence of 'pat' with 'rep' in each string. + + Pattern and replacement must be Python strings. 
+ + Examples + -------- + >>> from weld.grizzly import GrizzlySeries + >>> x = GrizzlySeries(["hello", "world"]) + >>> x.str.replace('o', 'lalala').str.to_pandas() + 0 helllalala + 1 wlalalarld + dtype: object + + """ + if not isinstance(pat, str): + raise TypeError("pattern in replace must be a Python 'str'") + if not isinstance(rep, str): + raise TypeError("replacement in replace must be a Python 'str'") + return self._apply(weldstr.replace, pat, rep) From 59e4ccd41e8b7352c408c356d930511582a1fe41 Mon Sep 17 00:00:00 2001 From: Shoumik Palkar Date: Sat, 21 Mar 2020 00:10:24 -0700 Subject: [PATCH 7/9] add tests for strings --- .../tests/grizzly/{ => core}/test_series.py | 0 .../tests/grizzly/core/test_strings.py | 120 ++++++++++++++++++ weld-python/weld/grizzly/core/strings.py | 4 +- weld-python/weld/grizzly/weld/str.py | 7 +- 4 files changed, 125 insertions(+), 6 deletions(-) rename weld-python/tests/grizzly/{ => core}/test_series.py (100%) create mode 100644 weld-python/tests/grizzly/core/test_strings.py diff --git a/weld-python/tests/grizzly/test_series.py b/weld-python/tests/grizzly/core/test_series.py similarity index 100% rename from weld-python/tests/grizzly/test_series.py rename to weld-python/tests/grizzly/core/test_series.py diff --git a/weld-python/tests/grizzly/core/test_strings.py b/weld-python/tests/grizzly/core/test_strings.py new file mode 100644 index 000000000..cef4539cc --- /dev/null +++ b/weld-python/tests/grizzly/core/test_strings.py @@ -0,0 +1,120 @@ +""" +Test string functionality. + +The behavior is tested against Pandas unless noted otherwise. + +""" + +import numpy as np +import pandas as pd +import pytest +import weld.grizzly as gr + +# To check whether the output is a string. +# TODO(shoumik): There should be a better way to do this, another reason +# to use ExtensionArray and a custom dtype for Weldified string arrays. 
+from weld.types import WeldVec, I8 + +def compare_vs_pandas(func, strings, *args, **kwargs): + pandas_series = pd.Series(strings) + grizzly_series = gr.GrizzlySeries(strings) + + pandas_result = getattr(pandas_series.str, func)(*args, **kwargs) + grizzly_result = getattr(grizzly_series.str, func)(*args, **kwargs) + if grizzly_result.output_type.elem_type != WeldVec(I8()): + grizzly_result = grizzly_result.to_pandas() + else: + # Perform UTF-8 decoding. + grizzly_result = grizzly_result.str.to_pandas() + assert pandas_result.equals(grizzly_result) + +def test_to_pandas(): + pass + +# Strings to test capitalization functions. +capitals_strings = [ + "hello", "HELLO", "LonGHelLO", "", + "3.141592, it's pi!", "many words in this one"] + +def test_lower(): + compare_vs_pandas('lower', capitals_strings) + +def test_upper(): + compare_vs_pandas('upper', capitals_strings) + +def test_capitalize(): + compare_vs_pandas('capitalize', capitals_strings) + +def test_get(): + """ + Behavior of get is different in Grizzly -- it currently returns empty strings + in cases where Pandas returns NaN. This will be changed in a later patch. 
+ + """ + inp = ["hello", "world", "test", "me", ''] + expect = ['l', 'l', 't', '', ''] + grizzly_result = gr.GrizzlySeries(inp).str.get(3).str.to_pandas() + pandas_result = pd.Series(expect) + assert pandas_result.equals(grizzly_result) + + expect = ['o', 'd', 't', 'e', ''] + grizzly_result = gr.GrizzlySeries(inp).str.get(-1).str.to_pandas() + pandas_result = pd.Series(expect) + assert pandas_result.equals(grizzly_result) + + expect = ['', '', '', '', ''] + grizzly_result = gr.GrizzlySeries(inp).str.get(-50).str.to_pandas() + pandas_result = pd.Series(expect) + assert pandas_result.equals(grizzly_result) + +def test_strip(): + compare_vs_pandas('strip', ["", + " hi ", + "\t\thi\n", + """ + + hello + + """, + " \t goodbye", + "goodbye again ", + " \n hi \n bye \n ", + """ + + hi + + bye + + """]) + +def test_contains(): + compare_vs_pandas('contains', ["abc", "abcdefg", "gfedcbaabcabcdef", ""], "abc") + +def test_startswith(): + compare_vs_pandas('startswith', ["abc", "abcdefg", "gfedcba", "", "defabc"], "abc") + +def test_endswith(): + compare_vs_pandas('endswith', ["abc", "abcdefg", "gfedabc", "", "defabc"], "abc") + +def test_find(): + compare_vs_pandas('find', ["abc", "abcdefg", "gfedcbaabcabcdef", ""], "abc") + compare_vs_pandas('find', ["abc", "abcdefg", "gfedcbaabcabcdef", ""], "abc", 2) + compare_vs_pandas('find', ["abc", "abcdefg", "gfedcbaabcabcdef", ""], "abc", 3) + compare_vs_pandas('find', ["abc", "abcdefg", "gfedcbaabcabcdef", ""], "abc", end=2) + compare_vs_pandas('find', ["abc", "abcdefg", "gfedcbaabcabcdef", ""], "abc", end=3) + compare_vs_pandas('find', ["abc", "abcdefg", "gfedcbaabcabcdef", ""], "abc", 3, end=7) + compare_vs_pandas('find', ["abc", "abcdefg", "gfedcbaabcabcdef", ""], "abc", 100, end=105) + +def test_replace(): + """ + Behavior of replace is different in Grizzly -- it currently only replaces the *first* + occurrence. This will be changed in a later patch.
+ + """ + import copy + inp = ["abc", "abcdefg", "abcabcabc", "gfedcbaabcabcdef", "", "XYZ"] + expect = [s.replace("abc", "XYZ", 1) for s in copy.copy(inp)] + grizzly_result = gr.GrizzlySeries(inp).str.replace("abc", "XYZ").str.to_pandas() + pandas_result = pd.Series(expect) + assert pandas_result.equals(grizzly_result) + diff --git a/weld-python/weld/grizzly/core/strings.py b/weld-python/weld/grizzly/core/strings.py index 6db4162ee..1616e5fc1 100644 --- a/weld-python/weld/grizzly/core/strings.py +++ b/weld-python/weld/grizzly/core/strings.py @@ -212,7 +212,7 @@ def endswith(self, pat): def find(self, sub, start=0, end=None): """ - Find 'sub' in each string. Each string is searched in the range [start,end]. + Find 'sub' in each string. Each string is searched in the range [start,end). 'sub' must be a Python string, and 'start' and 'end' must be Python integers. @@ -226,7 +226,7 @@ def find(self, sub, start=0, end=None): 2 -1 dtype: int64 >>> x.str.find('big', end=2).evaluate() - 0 0 + 0 -1 1 -1 2 -1 dtype: int64 diff --git a/weld-python/weld/grizzly/weld/str.py b/weld-python/weld/grizzly/weld/str.py index 0565d0052..7f7622a3d 100644 --- a/weld-python/weld/grizzly/weld/str.py +++ b/weld-python/weld/grizzly/weld/str.py @@ -258,8 +258,7 @@ def find(stringarr, sub, start, end=None): start = "i64({})".format(start) if end is None: - # -1L since we add 1L below to make the range inclusive in the size. - end = 'len(e) - 1L' + end = 'len(e)' else: end = "i64({})".format(end) @@ -271,8 +270,8 @@ def find(stringarr, sub, start, end=None): map({stringarr}, |e: vec[i8]| let start = {start}; - let size = {end} - start + 1L; # Search [start:end], so add 1L. 
- if (start < 0L || start + size > len(e), + let size = {end} - start; + if (start < 0L, -1L, let string = slice(e, start, size); let lenString = len(string); From c8fb7873595978d40567f5b7a575f64786d12eed Mon Sep 17 00:00:00 2001 From: Shoumik Palkar Date: Sat, 21 Mar 2020 00:10:58 -0700 Subject: [PATCH 8/9] remove some dead code --- weld-python/tests/grizzly/core/test_strings.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/weld-python/tests/grizzly/core/test_strings.py b/weld-python/tests/grizzly/core/test_strings.py index cef4539cc..e760c1cd0 100644 --- a/weld-python/tests/grizzly/core/test_strings.py +++ b/weld-python/tests/grizzly/core/test_strings.py @@ -28,9 +28,6 @@ def compare_vs_pandas(func, strings, *args, **kwargs): grizzly_result = grizzly_result.str.to_pandas() assert pandas_result.equals(grizzly_result) -def test_to_pandas(): - pass - # Strings to test capitalization functions. capitals_strings = [ "hello", "HELLO", "LonGHelLO", "", From 311d3143028aea7a32eaa77a053cd691b313241a Mon Sep 17 00:00:00 2001 From: Shoumik Palkar Date: Sat, 21 Mar 2020 09:42:37 -0700 Subject: [PATCH 9/9] add conftest --- weld-python/weld/conftest.py | 15 +++++++++++ weld-python/weld/grizzly/core/strings.py | 33 ++++++++---------------- 2 files changed, 26 insertions(+), 22 deletions(-) create mode 100644 weld-python/weld/conftest.py diff --git a/weld-python/weld/conftest.py b/weld-python/weld/conftest.py new file mode 100644 index 000000000..acbd85af3 --- /dev/null +++ b/weld-python/weld/conftest.py @@ -0,0 +1,15 @@ + +import pytest + +import numpy as np +import pandas as pd +import weld.grizzly as gr + +@pytest.fixture(autouse=True) +def add_imports(doctest_namespace): + """ + Make `gr`, `np`, and `pd` available for doctests.
+ """ + doctest_namespace["np"] = np + doctest_namespace["pd"] = pd + doctest_namespace["gr"] = gr diff --git a/weld-python/weld/grizzly/core/strings.py b/weld-python/weld/grizzly/core/strings.py index 1616e5fc1..e603a8cc6 100644 --- a/weld-python/weld/grizzly/core/strings.py +++ b/weld-python/weld/grizzly/core/strings.py @@ -35,8 +35,7 @@ def to_pandas(self): Examples -------- - >>> from weld.grizzly import GrizzlySeries - >>> x = GrizzlySeries(["Welcome", "to", "Grizzly!"]) + >>> x = gr.GrizzlySeries(["Welcome", "to", "Grizzly!"]) >>> x 0 b'Welcome' 1 b'to' @@ -70,8 +69,7 @@ def lower(self): Examples -------- - >>> from weld.grizzly import GrizzlySeries - >>> x = GrizzlySeries(["HELLO", "WorLD"]) + >>> x = gr.GrizzlySeries(["HELLO", "WorLD"]) >>> x.str.lower().str.to_pandas() 0 hello 1 world @@ -86,8 +84,7 @@ def upper(self): Examples -------- - >>> from weld.grizzly import GrizzlySeries - >>> x = GrizzlySeries(["hello", "WorlD"]) + >>> x = gr.GrizzlySeries(["hello", "WorlD"]) >>> x.str.upper().str.to_pandas() 0 HELLO 1 WORLD @@ -102,8 +99,7 @@ def capitalize(self): Examples -------- - >>> from weld.grizzly import GrizzlySeries - >>> x = GrizzlySeries(["hello", "worlD"]) + >>> x = gr.GrizzlySeries(["hello", "worlD"]) >>> x.str.capitalize().str.to_pandas() 0 Hello 1 World @@ -120,8 +116,7 @@ def get(self, index): Examples -------- - >>> from weld.grizzly import GrizzlySeries - >>> x = GrizzlySeries(["hello", "worlD"]) + >>> x = gr.GrizzlySeries(["hello", "worlD"]) >>> x.str.get(4).str.to_pandas() 0 o 1 D @@ -140,8 +135,7 @@ def strip(self): Examples -------- - >>> from weld.grizzly import GrizzlySeries - >>> x = GrizzlySeries([" hello ", " world \t "]) + >>> x = gr.GrizzlySeries([" hello ", " world \t "]) >>> x.str.strip().str.to_pandas() 0 hello 1 world @@ -158,8 +152,7 @@ def contains(self, pat): Examples -------- - >>> from weld.grizzly import GrizzlySeries - >>> x = GrizzlySeries(["hello", "world"]) + >>> x = gr.GrizzlySeries(["hello", "world"]) >>> 
x.str.contains('wor').evaluate() 0 False 1 True @@ -178,8 +171,7 @@ def startswith(self, pat): Examples -------- - >>> from weld.grizzly import GrizzlySeries - >>> x = GrizzlySeries(["hello", "world"]) + >>> x = gr.GrizzlySeries(["hello", "world"]) >>> x.str.startswith('wo').evaluate() 0 False 1 True @@ -198,8 +190,7 @@ def endswith(self, pat): Examples -------- - >>> from weld.grizzly import GrizzlySeries - >>> x = GrizzlySeries(["hello", "world"]) + >>> x = gr.GrizzlySeries(["hello", "world"]) >>> x.str.endswith('rld').evaluate() 0 False 1 True @@ -218,8 +209,7 @@ def find(self, sub, start=0, end=None): Examples -------- - >>> from weld.grizzly import GrizzlySeries - >>> x = GrizzlySeries(["bigfatcat", "fatcatbig", "reallybigcat"]) + >>> x = gr.GrizzlySeries(["bigfatcat", "fatcatbig", "reallybigcat"]) >>> x.str.find('fat').evaluate() 0 3 1 0 @@ -248,8 +238,7 @@ def replace(self, pat, rep): Examples -------- - >>> from weld.grizzly import GrizzlySeries - >>> x = GrizzlySeries(["hello", "world"]) + >>> x = gr.GrizzlySeries(["hello", "world"]) >>> x.str.replace('o', 'lalala').str.to_pandas() 0 helllalala 1 wlalalarld