From b781d94e8f7c85b32373ff33b77bb56cfc04c20b Mon Sep 17 00:00:00 2001
From: Shoumik Palkar <shoumik@cs.stanford.edu>
Date: Tue, 17 Mar 2020 18:53:11 -0700
Subject: [PATCH 1/9] start string integration

---
 weld-python/weld/grizzly/series.py | 46 +++++++++++++++++++++++++-----
 1 file changed, 39 insertions(+), 7 deletions(-)

diff --git a/weld-python/weld/grizzly/series.py b/weld-python/weld/grizzly/series.py
index fd7245358..db0080c73 100644
--- a/weld-python/weld/grizzly/series.py
+++ b/weld-python/weld/grizzly/series.py
@@ -8,11 +8,11 @@
 import warnings
 
 import weld.encoders.numpy as wenp
+import weld.grizzly.weld.str as weldstr
 
 from pandas.core.internals import SingleBlockManager
 from weld.lazy import PhysicalValue, WeldLazy, WeldNode, identity
 from weld.types import *
-
 from .weld.ops import *
 from .error import *
 
@@ -27,6 +27,23 @@ def _grizzlyseries_constructor_with_fallback(data=None, **kwargs):
     except TypeError:
         return pd.Series(data=data, **kwargs)
 
+class StringMethods(object):
+    """
+    String methods for Grizzly Series.
+
+    """
+
+    def __init__(self, series):
+        # TODO(soumik.palkar): This probably needs to take an extension array, because
+        # Pandas doesn't like the 'S' dtype.
+        if series.dtype.char != 'S':
+            raise ValueError("StringMethods only available for Series with dtype 'S'")
+        self.series = series
+
+    def lower(self):
+        lazy = weldstr.lower(self.weld_value_)(self.output_type, GrizzlySeries._decoder)
+        return GrizzlySeries(lazy, dtype=self.dtype)
+
 class GrizzlySeries(pd.Series):
     """
     A lazy `Series` object backed by a Weld computation.
@@ -212,7 +229,10 @@ def _supports_grizzly(cls, data):
         """
         if not isinstance(data, np.ndarray) or data.ndim != 1:
             return None
-        elem_type = wenp.dtype_to_weld_type(data.dtype)
+        if data.dtype.char == 'S':
+            elem_type = WeldVec(I8())
+        else:
+            elem_type = wenp.dtype_to_weld_type(data.dtype)
         return WeldVec(elem_type) if elem_type is not None else None
 
     # ---------------------- Initialization ------------------------------
@@ -249,17 +269,21 @@ def __new__(cls, data, dtype=None, index=None, **kwargs):
             super(GrizzlySeries, self).__init__(None, dtype=dtype, **kwargs)
             self.weld_value_ = data
             return self
-        elif index is not None and not isinstance(index, pd.RangeIndex):
+
+        if index is not None and not isinstance(index, pd.RangeIndex):
             # TODO(shoumik): This is probably incomplete, since we could have a
             # RangeIndex that does not capture the full span of the data, has a
             # non-zero step, etc.
             return pd.Series(data, dtype=dtype, index=index, **kwargs)
-        elif len(kwargs) != 0:
+
+        if len(kwargs) != 0:
+            # Unsupported arguments present: bail for now.
             return pd.Series(data, dtype=dtype, index=index, **kwargs)
-        elif not isinstance(data, np.ndarray):
+
+        if not isinstance(data, np.ndarray):
             # First, convert the input into a Series backed by an ndarray.
-            s = pd.Series(data, dtype=dtype, index=index, **kwargs)
-            data = s.values
+             s = pd.Series(data, dtype=dtype, index=index, **kwargs)
+             data = s.values
 
         # Try to create a Weld type for the input.
         weld_type = GrizzlySeries._supports_grizzly(data)
@@ -270,9 +294,17 @@ def __new__(cls, data, dtype=None, index=None, **kwargs):
                     PhysicalValue(data, weld_type, GrizzlySeries._encoder),
                     GrizzlySeries._decoder)
             return self
+
         # Don't re-convert values if we did it once already -- it's expensive.
         return s if s is not None else pd.Series(data, dtype=dtype, index=index, **kwargs)
 
+    # ---------------------- StringMethods ------------------------------
+
+    @property
+    def str(self):
+        # TODO(shoumik.palkar): Use pandas.core.accessor.CachedAccessor?
+        return StringMethods(self)
+
     # ---------------------- Indexing ------------------------------
 
     def __setitem__(self, key, value):

From 23802980a4ea50d1dd281f181cd15bf1d3186d52 Mon Sep 17 00:00:00 2001
From: Shoumik Palkar <shoumik@cs.stanford.edu>
Date: Wed, 18 Mar 2020 16:44:08 -0700
Subject: [PATCH 2/9] more string stuff

---
 weld-python/weld/compile.py          |  6 +--
 weld-python/weld/encoders/numpy.py   |  2 +-
 weld-python/weld/grizzly/series.py   | 80 +++++++++++++++++++++++++---
 weld-python/weld/grizzly/weld/str.py | 79 +++++++++++++++++++++++++++
 4 files changed, 155 insertions(+), 12 deletions(-)
 create mode 100644 weld-python/weld/grizzly/weld/str.py

diff --git a/weld-python/weld/compile.py b/weld-python/weld/compile.py
index b5bb436ba..8ef0586d0 100644
--- a/weld-python/weld/compile.py
+++ b/weld-python/weld/compile.py
@@ -81,7 +81,7 @@ def compile(program, arg_types, encoders, restype, decoder, conf=None):
     >>> from weld.types import *
     >>> func = compile("|x: i32| x + 1",
     ...        [I32()],  [None],
-    ...        I32(), None) 
+    ...        I32(), None)
     ...
     >>> func(100)[0]
     101
@@ -92,7 +92,7 @@ def compile(program, arg_types, encoders, restype, decoder, conf=None):
 
     >>> func = compile("|x: i32, y: i32| x + y",
     ...        [I32(), I32()],  [None, None],
-    ...        I32(), None) 
+    ...        I32(), None)
     ...
     >>> func(5, 6)[0]
     11
@@ -101,7 +101,7 @@ def compile(program, arg_types, encoders, restype, decoder, conf=None):
 
     >>> func = compile("|x: i32| x + 1",
     ...        [I32()],  [PrimitiveWeldEncoder()],
-    ...        I32(), PrimitiveWeldDecoder()) 
+    ...        I32(), PrimitiveWeldDecoder())
     ...
     >>> func(100)[0]
     101
diff --git a/weld-python/weld/encoders/numpy.py b/weld-python/weld/encoders/numpy.py
index 2f79593b6..23852f946 100644
--- a/weld-python/weld/encoders/numpy.py
+++ b/weld-python/weld/encoders/numpy.py
@@ -268,7 +268,7 @@ class StringConversionFuncs(object):
     """
 
     stringfuncs = ctypes.PyDLL(weld.encoders._strings.__file__)
-    string_cclass = WeldVec(I8()).ctype_class
+    string_cclass = WeldVec(WeldVec(I8())).ctype_class
 
     @staticmethod
     def numpy_string_array_to_weld(arr):
diff --git a/weld-python/weld/grizzly/series.py b/weld-python/weld/grizzly/series.py
index db0080c73..143b01730 100644
--- a/weld-python/weld/grizzly/series.py
+++ b/weld-python/weld/grizzly/series.py
@@ -19,8 +19,7 @@
 def _grizzlyseries_constructor_with_fallback(data=None, **kwargs):
     """
     A flexible constructor for Series._constructor, which needs to be able
-    to fall back to a Series (if a certain operation does not produce
-    geometries)
+    to fall back to a Series (if a certain operation cannot produce GrizzlySeries).
     """
     try:
         return GrizzlySeries(data=data, **kwargs)
@@ -33,16 +32,75 @@ class StringMethods(object):
 
     """
 
+    __slots__ = [ "series" ]
+
     def __init__(self, series):
-        # TODO(soumik.palkar): This probably needs to take an extension array, because
-        # Pandas doesn't like the 'S' dtype.
         if series.dtype.char != 'S':
             raise ValueError("StringMethods only available for Series with dtype 'S'")
         self.series = series
 
+    def to_pandas(self):
+        """
+        Convert a GrizzlySeries of strings to a Pandas series.
+
+        We provide a specialized implementation of `to_pandas` here that will perform UTF-8 decoding
+        of the raw bytestrings that Grizzly series operate over.
+
+        Examples
+        --------
+        >>> x = GrizzlySeries(["Welcome", "to", "Grizzly!"])
+        >>> x
+        0     b'Welcome'
+        1          b'to'
+        2    b'Grizzly!'
+        dtype: bytes64
+        >>> x.str.to_pandas()
+        0     Welcome
+        1          to
+        2    Grizzly!
+        dtype: object
+
+        """
+        return self.series.evaluate().to_pandas().str.decode("utf-8")
+
+    def _apply(self, func):
+        """
+        Apply the given weldfunc to `self.series` and return a new GrizzlySeries.
+
+        """
+        lazy = func(self.series.weld_value_)(self.series.output_type, GrizzlySeries._decoder)
+        return GrizzlySeries(lazy, dtype='S')
+
     def lower(self):
-        lazy = weldstr.lower(self.weld_value_)(self.output_type, GrizzlySeries._decoder)
-        return GrizzlySeries(lazy, dtype=self.dtype)
+        """
+        Lowercase strings in a GrizzlySeries.
+
+        Examples
+        --------
+        >>> x = GrizzlySeries(["HELLO", "WorLD"])
+        >>> x.str.lower().str.to_pandas()
+        0    hello
+        1    world
+        dtype: object
+
+        """
+        return self._apply(weldstr.lower)
+
+    def upper(self):
+        """
+        Uppercase strings in a GrizzlySeries.
+
+        Examples
+        --------
+        >>> x = GrizzlySeries(["hello", "WorlD"])
+        >>> x.str.upper().str.to_pandas()
+        0    HELLO
+        1    WORLD
+        dtype: object
+
+        """
+        return self._apply(weldstr.upper)
+
 
 class GrizzlySeries(pd.Series):
     """
@@ -266,7 +324,7 @@ def __new__(cls, data, dtype=None, index=None, **kwargs):
         s = None
         if isinstance(data, WeldLazy):
             self = super(GrizzlySeries, cls).__new__(cls)
-            super(GrizzlySeries, self).__init__(None, dtype=dtype, **kwargs)
+            super(GrizzlySeries, self).__init__(np.array([], dtype=dtype), **kwargs)
             self.weld_value_ = data
             return self
 
@@ -280,7 +338,13 @@ def __new__(cls, data, dtype=None, index=None, **kwargs):
             # Unsupported arguments present: bail for now.
             return pd.Series(data, dtype=dtype, index=index, **kwargs)
 
-        if not isinstance(data, np.ndarray):
+        if isinstance(data, list) and len(data) > 0 and isinstance(data[0], str):
+            # Try to convert a list of strings into a supported Numpy array.
+            data = np.array(data, dtype='S')
+
+        if isinstance(data, pd.Series):
+            data = data.values
+        elif not isinstance(data, np.ndarray):
             # First, convert the input into a Series backed by an ndarray.
              s = pd.Series(data, dtype=dtype, index=index, **kwargs)
              data = s.values
diff --git a/weld-python/weld/grizzly/weld/str.py b/weld-python/weld/grizzly/weld/str.py
new file mode 100644
index 000000000..30025fe1e
--- /dev/null
+++ b/weld-python/weld/grizzly/weld/str.py
@@ -0,0 +1,79 @@
+"""
+String functions exported as weldfuncs.
+
+Each function takes an argument representing an array of strings, and outputs a
+program that applies some transformation on each string. The functions are annotated
+with `weld.lazy.weldfunc`, so they accept `WeldLazy` objects and return functions
+for constructing Weld programs.
+
+"""
+
+import weld.lazy
+
+@weld.lazy.weldfunc
+def lower(stringarr):
+    """
+    Convert values to lowercase.
+
+    """
+    return """map(
+    {stringarr},
+    |e: vec[i8]|
+        result(
+            for(e,
+                appender[i8],
+                |c: appender[i8], j: i64, f: i8|
+                    if(f > 64c && f < 91c,
+                        merge(c, f + 32c),
+                        merge(c, f))
+            )
+        )
+    )""".format(stringarr=stringarr)
+
+
+@weld.lazy.weldfunc
+def upper(stringarr):
+    """
+    Convert values to uppercase.
+
+    """
+    return """map(
+    {stringarr},
+    |e: vec[i8]|
+        result(
+            for(e,
+                appender[i8],
+                |c: appender[i8], j: i64, f: i8|
+                    if(f > 96c && f < 123c,
+                        merge(c, f - 32c),
+                        merge(c, f))
+            )
+        )
+    )""".format(stringarr=stringarr)
+
+@weld.lazy.weldfunc
+def weld_str_capitalize(stringarr):
+    """
+    Capitalize first letter.
+
+    """
+    return """map(
+    {stringarr},
+    |e: vec[i8]|
+        let lenString = len(e);
+        if(lenString > 0L,
+            let res = appender[i8];
+            let firstChar = lookup(e, 0L);
+            let res = if(firstChar > 96c && firstChar < 123c, merge(res, firstChar - 32c), merge(res, firstChar));
+            result(
+                for(slice(e, 1L, lenString - 1L),
+                    res,
+                    |c: appender[i8], j: i64, f: i8|
+                        if(f > 64c && f < 91c,
+                            merge(c, f + 32c),
+                            merge(c, f)
+                        )
+                )
+            ),
+            e)
+    )""".format(stringarr=stringarr)

From f7ef0cd643d2809503bd3f8f139158e72b63772f Mon Sep 17 00:00:00 2001
From: Shoumik Palkar <shoumik@cs.stanford.edu>
Date: Wed, 18 Mar 2020 18:37:35 -0700
Subject: [PATCH 3/9] add some string methods

---
 weld-python/weld/compile.py          |  1 +
 weld-python/weld/grizzly/series.py   | 63 +++++++++++++++++++++++++---
 weld-python/weld/grizzly/weld/str.py | 48 ++++++++++++++++++++-
 weld/src/lib.rs                      |  2 +-
 4 files changed, 107 insertions(+), 7 deletions(-)

diff --git a/weld-python/weld/compile.py b/weld-python/weld/compile.py
index 8ef0586d0..160048c79 100644
--- a/weld-python/weld/compile.py
+++ b/weld-python/weld/compile.py
@@ -158,6 +158,7 @@ def func(*args, context=None):
         raw_args_pointer = ctypes.addressof(raw_args)
         value = WeldValue(raw_args_pointer)
 
+
         if context is None:
             context = WeldContext(conf)
 
diff --git a/weld-python/weld/grizzly/series.py b/weld-python/weld/grizzly/series.py
index 143b01730..0be2eb8c5 100644
--- a/weld-python/weld/grizzly/series.py
+++ b/weld-python/weld/grizzly/series.py
@@ -28,7 +28,9 @@ def _grizzlyseries_constructor_with_fallback(data=None, **kwargs):
 
 class StringMethods(object):
     """
-    String methods for Grizzly Series.
+    String methods for Grizzly. Currently, string methods only apply to ASCII
+    strings; while users can pass UTF-8 strings into Grizzly, their non-ASCII codepoints
+    will be ignored by the below operations and will be returned unmodified.
 
     """
 
@@ -63,17 +65,17 @@ def to_pandas(self):
         """
         return self.series.evaluate().to_pandas().str.decode("utf-8")
 
-    def _apply(self, func):
+    def _apply(self, func, *args):
         """
         Apply the given weldfunc to `self.series` and return a new GrizzlySeries.
 
         """
-        lazy = func(self.series.weld_value_)(self.series.output_type, GrizzlySeries._decoder)
+        lazy = func(self.series.weld_value_, *args)(self.series.output_type, GrizzlySeries._decoder)
         return GrizzlySeries(lazy, dtype='S')
 
     def lower(self):
         """
-        Lowercase strings in a GrizzlySeries.
+        Lowercase strings.
 
         Examples
         --------
@@ -88,7 +90,7 @@ def lower(self):
 
     def upper(self):
         """
-        Uppercase strings in a GrizzlySeries.
+        Uppercase strings.
 
         Examples
         --------
@@ -101,6 +103,57 @@ def upper(self):
         """
         return self._apply(weldstr.upper)
 
+    def capitalize(self):
+        """
+        Capitalize the first character in each string.
+
+        Examples
+        --------
+        >>> x = GrizzlySeries(["hello", "worlD"])
+        >>> x.str.capitalize().str.to_pandas()
+        0    Hello
+        1    World
+        dtype: object
+
+        """
+        return self._apply(weldstr.capitalize)
+
+    def get(self, index):
+        """
+        Get the character at index 'index' from each string. If 'index' is greater
+        than or equal to the string length, this returns an empty string. If 'index' is
+        less than 0, this wraps around, using Python's indexing behavior.
+
+        Examples
+        --------
+        >>> x = GrizzlySeries(["hello", "worlD"])
+        >>> x.str.get(4).str.to_pandas()
+        0    o
+        1    D
+        dtype: object
+        >>> x.str.get(-3).str.to_pandas()
+        0    l
+        1    r
+        dtype: object
+
+        """
+        return self._apply(weldstr.get, index)
+
+    def strip(self):
+        """
+        Strip whitespace from the string.
+
+        Examples
+        --------
+        >>> x = GrizzlySeries(["     hello   ", "   world    \t  "])
+        >>> x.str.strip().str.to_pandas()
+        0    hello
+        1    world
+        dtype: object
+
+        """
+        return self._apply(weldstr.strip)
+
 
 class GrizzlySeries(pd.Series):
     """
diff --git a/weld-python/weld/grizzly/weld/str.py b/weld-python/weld/grizzly/weld/str.py
index 30025fe1e..46199ab84 100644
--- a/weld-python/weld/grizzly/weld/str.py
+++ b/weld-python/weld/grizzly/weld/str.py
@@ -52,7 +52,7 @@ def upper(stringarr):
     )""".format(stringarr=stringarr)
 
 @weld.lazy.weldfunc
-def weld_str_capitalize(stringarr):
+def capitalize(stringarr):
     """
     Capitalize first letter.
 
@@ -77,3 +77,49 @@ def weld_str_capitalize(stringarr):
             ),
             e)
     )""".format(stringarr=stringarr)
+
+
+@weld.lazy.weldfunc
+def get(stringarr, i):
+    """
+    Retrieves the character at index 'i'.
+
+    If 'i' is greater than or equal to the string length, returns '\0'.
+
+    """
+    i = "i64({})".format(i)
+    return """map(
+    {stringarr},
+    |e: vec[i8]|
+        let lenString = len(e);
+        if({i} >= lenString,
+            [0c],
+            if({i} > 0L,
+                result(merge(appender[i8], lookup(slice(e, 0L, lenString), {i}))),
+                if ({i} > -lenString,
+                    result(merge(appender[i8], lookup(slice(e, lenString, {i}), {i}))),
+                    [0c]
+                )
+            )
+        )
+    )""".format(stringarr=stringarr, i=i)
+
+
+@weld.lazy.weldfunc
+def strip(stringarr):
+    """
+    Strip whitespace from the start and end of each string.
+
+    """
+    # From https://en.wikipedia.org/wiki/Whitespace_character.
+    is_whitespace = "((lookup(e, p) == 32c) || (lookup(e, p)  >= 9c && lookup(e, p) <= 13c))"
+    # +3L = +1 compensate start_i already +1'ed, +1 compensate end_i already -1'ed, +1 compensate for slice with size
+    return """map(
+    {stringarr},
+    |e: vec[i8]|
+        let lenString = len(e);
+        let start_i = iterate(0L, |p| {{p + 1L, p < lenString && {is_whitespace}}});
+        let end_i = iterate(lenString - 1L, |p| {{p - 1L, p > start_i && {is_whitespace}}});
+        # slice(e, start_i - 1L, lenString - start_i + 1L)
+        slice(e, start_i - 1L, end_i - start_i + 3L)
+    )""".format(stringarr=stringarr, is_whitespace=is_whitespace)
diff --git a/weld/src/lib.rs b/weld/src/lib.rs
index cbfd717ff..4c267a1e6 100644
--- a/weld/src/lib.rs
+++ b/weld/src/lib.rs
@@ -713,7 +713,7 @@ impl WeldModule {
             use crate::sir::optimizations;
             info!("Applying SIR optimizations");
             optimizations::fold_constants::fold_constants(&mut sir_prog)?;
-            optimizations::simplify_assignments::simplify_assignments(&mut sir_prog)?;
+            // optimizations::simplify_assignments::simplify_assignments(&mut sir_prog)?;
         }
         let end = PreciseTime::now();
         debug!("Optimized SIR program:\n{}\n", &sir_prog);

From 67081c6a42fd8f1fff1061b4d02b7e853f38be5d Mon Sep 17 00:00:00 2001
From: Shoumik Palkar <shoumik@cs.stanford.edu>
Date: Thu, 19 Mar 2020 20:55:36 -0700
Subject: [PATCH 4/9] add contains implementation

---
 weld-python/weld/grizzly/series.py   | 34 ++++++++++++++---
 weld-python/weld/grizzly/weld/str.py | 56 +++++++++++++++++++++++++++-
 2 files changed, 84 insertions(+), 6 deletions(-)

diff --git a/weld-python/weld/grizzly/series.py b/weld-python/weld/grizzly/series.py
index 0be2eb8c5..1d1f9b019 100644
--- a/weld-python/weld/grizzly/series.py
+++ b/weld-python/weld/grizzly/series.py
@@ -65,13 +65,18 @@ def to_pandas(self):
         """
         return self.series.evaluate().to_pandas().str.decode("utf-8")
 
-    def _apply(self, func, *args):
+    def _apply(self, func, *args, return_weld_elem_type=None):
         """
         Apply the given weldfunc to `self.series` and return a new GrizzlySeries.
 
+        If the return type of the result is not a string GrizzlySeries, pass
+        'return_weld_elem_type' to specify the element type of the result.
+
         """
-        lazy = func(self.series.weld_value_, *args)(self.series.output_type, GrizzlySeries._decoder)
-        return GrizzlySeries(lazy, dtype='S')
+        output_type = self.series.output_type if return_weld_elem_type is None else WeldVec(return_weld_elem_type)
+        dtype = 'S' if return_weld_elem_type is None else wenp.weld_type_to_dtype(return_weld_elem_type)
+        lazy = func(self.series.weld_value_, *args)(output_type, GrizzlySeries._decoder)
+        return GrizzlySeries(lazy, dtype=dtype)
 
     def lower(self):
         """
@@ -154,6 +159,25 @@ def strip(self):
         """
         return self._apply(weldstr.strip)
 
+    def contains(self, pat):
+        """
+        Returns whether each string contains the provided pattern.
+
+        Pattern must be a Python string.
+
+        Examples
+        --------
+        >>> x = GrizzlySeries(["hello", "world"])
+        >>> x.str.contains('wor').evaluate()
+        0    False
+        1     True
+        dtype: bool
+
+        """
+        if not isinstance(pat, str):
+            raise TypeError("pattern in contains must be a Python 'str'")
+        return self._apply(weldstr.contains, pat, return_weld_elem_type=Bool())
+
 
 class GrizzlySeries(pd.Series):
     """
@@ -367,9 +391,9 @@ def __new__(cls, data, dtype=None, index=None, **kwargs):
         >>> x = GrizzlySeries(np.ones(5))
         >>> x.__class__
         <class 'weld.grizzly.series.GrizzlySeries'>
-        >>> y = GrizzlySeries(['hi', 'bye']) # Unsupported
+        >>> y = GrizzlySeries(['hi', 'bye'])
         >>> y.__class__
-        <class 'pandas.core.series.Series'>
+        <class 'weld.grizzly.series.GrizzlySeries'>
         >>> y = GrizzlySeries([1, 2, 3], index=[1, 0, 2]) # Unsupported
         >>> y.__class__
         <class 'pandas.core.series.Series'>
diff --git a/weld-python/weld/grizzly/weld/str.py b/weld-python/weld/grizzly/weld/str.py
index 46199ab84..c1bbb4194 100644
--- a/weld-python/weld/grizzly/weld/str.py
+++ b/weld-python/weld/grizzly/weld/str.py
@@ -10,6 +10,18 @@
 
 import weld.lazy
 
+def string_to_weld_literal(s):
+    """
+    Converts a string to a UTF-8 encoded Weld literal byte-vector.
+
+    Examples
+    --------
+    >>> string_to_weld_literal('hello')
+    '[104c,101c,108c,108c,111c]'
+
+    """
+    return "[" + ",".join([str(b) + 'c' for b in list(s.encode('utf-8'))]) + "]"
+
 @weld.lazy.weldfunc
 def lower(stringarr):
     """
@@ -120,6 +132,48 @@ def strip(stringarr):
         let lenString = len(e);
         let start_i = iterate(0L, |p| {{p + 1L, p < lenString && {is_whitespace}}});
         let end_i = iterate(lenString - 1L, |p| {{p - 1L, p > start_i && {is_whitespace}}});
-        # slice(e, start_i - 1L, lenString - start_i + 1L)
         slice(e, start_i - 1L, end_i - start_i + 3L)
     )""".format(stringarr=stringarr, is_whitespace=is_whitespace)
+
+@weld.lazy.weldfunc
+def contains(stringarr, pat):
+    """
+    Check whether each element contains the substring 'pat', and returns
+    a boolean array of the results.
+
+    For now, 'pat' must be a string literal.
+    """
+    define_pat = "let pat = {};".format(string_to_weld_literal(pat))
+
+    return """
+    {define_pat}
+    let lenPat = len(pat);
+        map({stringarr},
+            |e: vec[i8]|
+                let lenString = len(e);
+                if(lenPat > lenString,
+                    false,
+                    # start by assuming pat is not found, until proven it is
+                    let words_iter_res = iterate({{0L, false}},
+                        |p|
+                            let e_i = p.$0;
+                            let pat_i = 0L;
+                            # start by assuming the substring and pat are the same, until proven otherwise
+                            let word_check_res = iterate({{e_i, pat_i, true}},
+                                |q|
+                                    let found = lookup(e, q.$0) == lookup(pat, q.$1);
+                                    {{
+                                        {{q.$0 + 1L, q.$1 + 1L, found}},
+                                        q.$1 + 1L < lenPat &&
+                                        found == true
+                                    }}
+                            ).$2;
+                            {{
+                                {{p.$0 + 1L, word_check_res}},
+                                p.$0 + lenPat < lenString &&
+                                word_check_res == false
+                            }}
+                    ).$1;
+                    words_iter_res
+                )
+        )""".format(stringarr=stringarr, define_pat=define_pat)

From 4346ad37c6561310c78717a23072f6426287d984 Mon Sep 17 00:00:00 2001
From: Shoumik Palkar <shoumik@cs.stanford.edu>
Date: Fri, 20 Mar 2020 17:02:25 -0700
Subject: [PATCH 5/9] more string functions

---
 weld-python/weld/grizzly/series.py   |  87 ++++++++++++
 weld-python/weld/grizzly/weld/str.py | 196 +++++++++++++++++++++++++++
 2 files changed, 283 insertions(+)

diff --git a/weld-python/weld/grizzly/series.py b/weld-python/weld/grizzly/series.py
index 1d1f9b019..3c4b69207 100644
--- a/weld-python/weld/grizzly/series.py
+++ b/weld-python/weld/grizzly/series.py
@@ -178,6 +178,93 @@ def contains(self, pat):
             raise TypeError("pattern in contains must be a Python 'str'")
         return self._apply(weldstr.contains, pat, return_weld_elem_type=Bool())
 
+    def startswith(self, pat):
+        """
+        Returns whether each string starts with the provided pattern.
+
+        Pattern must be a Python string.
+
+        Examples
+        --------
+        >>> x = GrizzlySeries(["hello", "world"])
+        >>> x.str.startswith('wo').evaluate()
+        0    False
+        1     True
+        dtype: bool
+
+        """
+        if not isinstance(pat, str):
+            raise TypeError("pattern in startswith must be a Python 'str'")
+        return self._apply(weldstr.startswith, pat, return_weld_elem_type=Bool())
+
+    def endswith(self, pat):
+        """
+        Returns whether each string ends with the provided pattern.
+
+        Pattern must be a Python string.
+
+        Examples
+        --------
+        >>> x = GrizzlySeries(["hello", "world"])
+        >>> x.str.endswith('rld').evaluate()
+        0    False
+        1     True
+        dtype: bool
+
+        """
+        if not isinstance(pat, str):
+            raise TypeError("pattern in endswith must be a Python 'str'")
+        return self._apply(weldstr.endswith, pat, return_weld_elem_type=Bool())
+
+    def find(self, sub, start=0, end=None):
+        """
+        Find 'sub' in each string. Each string is searched in the range [start,end].
+
+        'sub' must be a Python string, and 'start' and 'end' must be Python integers.
+
+        Examples
+        --------
+        >>> x = GrizzlySeries(["bigfatcat", "fatcatbig", "reallybigcat"])
+        >>> x.str.find('fat').evaluate()
+        0    3
+        1    0
+        2   -1
+        dtype: int64
+        >>> x.str.find('big', end=2).evaluate()
+        0    0
+        1   -1
+        2   -1
+        dtype: int64
+
+        """
+        if not isinstance(sub, str):
+            raise TypeError("sub in find must be a Python 'str'")
+        if not isinstance(start, int):
+            raise TypeError("start in find must be a Python 'int'")
+        if end is not None and not isinstance(end, int):
+            raise TypeError("end in find must be a Python 'int'")
+        return self._apply(weldstr.find, sub, start, end, return_weld_elem_type=I64())
+
+    def replace(self, pat, rep):
+        """
+        Replaces the first occurrence of 'pat' with 'rep' in each string.
+
+        Pattern and replacement must be Python strings.
+
+        Examples
+        --------
+        >>> x = GrizzlySeries(["hello", "world"])
+        >>> x.str.replace('o', 'lalala').str.to_pandas()
+        0    helllalala
+        1    wlalalarld
+        dtype: object
+
+        """
+        if not isinstance(pat, str):
+            raise TypeError("pattern in replace must be a Python 'str'")
+        if not isinstance(rep, str):
+            raise TypeError("replacement in replace must be a Python 'str'")
+        return self._apply(weldstr.replace, pat, rep)
 
 class GrizzlySeries(pd.Series):
     """
diff --git a/weld-python/weld/grizzly/weld/str.py b/weld-python/weld/grizzly/weld/str.py
index c1bbb4194..0565d0052 100644
--- a/weld-python/weld/grizzly/weld/str.py
+++ b/weld-python/weld/grizzly/weld/str.py
@@ -6,6 +6,11 @@
 with `weld.lazy.weldfunc`, so they accept `WeldLazy` objects and return functions
 for constructing Weld programs.
 
+These are adapted from
+https://github.com/weld-project/baloo/blob/master/baloo/weld/weld_str.py.
+
+We may choose to re-implement these as UDF calls to Rust's UTF-8 library in the future.
+
 """
 
 import weld.lazy
@@ -177,3 +182,194 @@ def contains(stringarr, pat):
                     words_iter_res
                 )
         )""".format(stringarr=stringarr, define_pat=define_pat)
+
+@weld.lazy.weldfunc
+def startswith(stringarr, pat):
+    """
+    Check whether each element starts with the substring 'pat', and returns
+    a boolean array of the results.
+
+    For now, 'pat' must be a string literal.
+
+    """
+    define_pat = "let pat = {};".format(string_to_weld_literal(pat))
+
+    return """
+    {define_pat}
+    let lenPat = len(pat);
+    map({stringarr},
+        |e: vec[i8]|
+            let lenString = len(e);
+            if(lenPat > lenString,
+                false,
+                iterate({{0L, true}},
+                    |q|
+                        let found = lookup(e, q.$0) == lookup(pat, q.$0);
+                        {{
+                            {{q.$0 + 1L, found}},
+                            q.$0 + 1L < lenPat &&
+                            found == true
+                        }}
+                ).$1
+            )
+    )""".format(stringarr=stringarr, define_pat=define_pat)
+
+
+@weld.lazy.weldfunc
+def endswith(stringarr, pat):
+    """
+    Check whether each element ends with the substring 'pat', and returns
+    a boolean array of the results.
+
+    For now, 'pat' must be a string literal.
+
+    """
+    define_pat = "let pat = {};".format(string_to_weld_literal(pat))
+
+    return """
+    {define_pat}
+    let lenPat = len(pat);
+    map({stringarr},
+        |e: vec[i8]|
+            let lenString = len(e);
+            if(lenPat > lenString,
+                false,
+                iterate({{lenString - lenPat, 0L, true}},
+                    |q|
+                        let found = lookup(e, q.$0) == lookup(pat, q.$1);
+                        {{
+                            {{q.$0 + 1L, q.$1 + 1L, found}},
+                            q.$1 + 1L < lenPat &&
+                            found == true
+                        }}
+                ).$2
+            )
+    )""".format(stringarr=stringarr, define_pat=define_pat)
+
+@weld.lazy.weldfunc
+def find(stringarr, sub, start, end=None):
+    """
+    Searches for 'sub' in each string in the range 'start', 'end'.  Returns an
+    i64 array holding the index where 'sub' was found, or -1 if it was not found.
+
+    'sub' must be a Python string. 'start' and 'end' must be integers.
+
+    """
+
+    start = "i64({})".format(start)
+    if end is None:
+        # -1L since we add 1L below to make the range inclusive in the size.
+        end = 'len(e) - 1L'
+    else:
+        end = "i64({})".format(end)
+
+    define_sub = "let sub = {};".format(string_to_weld_literal(sub))
+
+    return """
+    {define_sub}
+    let lenSub = len(sub);
+    map({stringarr},
+        |e: vec[i8]|
+            let start = {start};
+            let size = {end} - start + 1L; # Search [start:end], so add 1L.
+            if (start < 0L || start + size > len(e),
+                -1L,
+                let string = slice(e, start, size);
+                let lenString = len(string);
+                if(lenSub > lenString,
+                    -1L,
+                    # start by assuming sub is not found, until proven it is
+                    let words_iter_res = iterate({{0L, false}},
+                        |p|
+                            let e_i = p.$0;
+                            let pat_i = 0L;
+                            # start by assuming the substring and sub are the same, until proven otherwise
+                            let word_check_res = iterate({{e_i, pat_i, true}},
+                                |q|
+                                    let found = lookup(string, q.$0) == lookup(sub, q.$1);
+                                    {{
+                                        {{q.$0 + 1L, q.$1 + 1L, found}},
+                                        q.$1 + 1L < lenSub &&
+                                        found == true
+                                    }}
+                            ).$2;
+                            {{
+                                {{p.$0 + 1L, word_check_res}},
+                                p.$0 + lenSub < lenString &&
+                                word_check_res == false
+                            }}
+                    );
+                    if(words_iter_res.$1 == true,
+                        words_iter_res.$0 - 1L + start,
+                        -1L
+                    )
+                )
+            )
+    )""".format(stringarr=stringarr, define_sub=define_sub, start=start, end=end)
+
+
+@weld.lazy.weldfunc
+def replace(stringarr, pat, rep):
+    """
+    Replace the first occurrence of 'pat' in each string with 'rep'.
+
+    For now, 'pat' and 'rep' must be Python strings.
+
+    """
+    define_pat = "let pat = {};".format(string_to_weld_literal(pat))
+    define_rep = "let rep = {};".format(string_to_weld_literal(rep))
+
+    return  """
+    {define_pat}
+    {define_rep}
+    let lenPat = len(pat);
+    map({stringarr},
+        |e: vec[i8]|
+            let lenString = len(e);
+            if(lenPat > lenString,
+                e,
+                # start by assuming sub is not found, until proven it is
+                let words_iter_res = iterate({{0L, false}},
+                    |p|
+                        let e_i = p.$0;
+                        let pat_i = 0L;
+                        # start by assuming the substring and sub are the same, until proven otherwise
+                        let word_check_res = iterate({{e_i, pat_i, true}},
+                            |q|
+                                let found = lookup(e, q.$0) == lookup(pat, q.$1);
+                                {{
+                                    {{q.$0 + 1L, q.$1 + 1L, found}},
+                                    q.$1 + 1L < lenPat &&
+                                    found == true
+                                }}
+                        ).$2;
+                        {{
+                            {{p.$0 + 1L, word_check_res}},
+                            p.$0 + lenPat < lenString &&
+                            word_check_res == false
+                        }}
+                );
+                if(words_iter_res.$1 == true,
+                    let rep_from = words_iter_res.$0 - 1L;
+                    let rep_to = rep_from + lenPat;
+                    let res = appender[i8];
+                    let res = for(slice(e, 0L, rep_from),
+                        res,
+                        |c: appender[i8], j: i64, f: i8|
+                            merge(c, f)
+                    );
+                    let res = for(rep,
+                        res,
+                        |c: appender[i8], j: i64, f: i8|
+                            merge(c, f)
+                    );
+                    let res = for(slice(e, rep_to, lenString),
+                        res,
+                        |c: appender[i8], j: i64, f: i8|
+                            merge(c, f)
+                    );
+                    result(res),
+                    e
+                )
+            )
+    )""".format(stringarr=stringarr, define_pat=define_pat, define_rep=define_rep)

From 3832175ae76a600b94ae6c3cb51d84ecd5d41f20 Mon Sep 17 00:00:00 2001
From: Shoumik Palkar <shoumik@cs.stanford.edu>
Date: Fri, 20 Mar 2020 17:31:22 -0700
Subject: [PATCH 6/9] move code around, add some more string functions

---
 weld-python/tests/grizzly/test_series.py      |   4 +-
 weld-python/weld/grizzly/__init__.py          |   2 +
 weld-python/weld/grizzly/core/__init__.py     |   1 +
 weld-python/weld/grizzly/{ => core}/error.py  |   0
 weld-python/weld/grizzly/{ => core}/series.py | 254 +----------------
 weld-python/weld/grizzly/core/strings.py      | 263 ++++++++++++++++++
 6 files changed, 275 insertions(+), 249 deletions(-)
 create mode 100644 weld-python/weld/grizzly/core/__init__.py
 rename weld-python/weld/grizzly/{ => core}/error.py (100%)
 rename weld-python/weld/grizzly/{ => core}/series.py (74%)
 create mode 100644 weld-python/weld/grizzly/core/strings.py

diff --git a/weld-python/tests/grizzly/test_series.py b/weld-python/tests/grizzly/test_series.py
index 3cece7fba..351dfc8ee 100644
--- a/weld-python/tests/grizzly/test_series.py
+++ b/weld-python/tests/grizzly/test_series.py
@@ -6,7 +6,7 @@
 import numpy as np
 import pandas as pd
 import pytest
-import weld.grizzly.series as gr
+import weld.grizzly as gr
 
 types_ = ['int8', 'uint8', 'int16', 'uint16', 'int32',\
         'uint32', 'int64', 'uint64', 'float32', 'float64']
@@ -134,7 +134,7 @@ def test_indexing():
 
 def test_unsupported_binop_error():
     # Test unsupported
-    from weld.grizzly.error import GrizzlyError
+    from weld.grizzly.core.error import GrizzlyError
     with pytest.raises(GrizzlyError):
         a = gr.GrizzlySeries([1,2,3])
         b = pd.Series([1,2,3])
diff --git a/weld-python/weld/grizzly/__init__.py b/weld-python/weld/grizzly/__init__.py
index e69de29bb..963902a18 100644
--- a/weld-python/weld/grizzly/__init__.py
+++ b/weld-python/weld/grizzly/__init__.py
@@ -0,0 +1,2 @@
+
+from weld.grizzly.core.series import GrizzlySeries
diff --git a/weld-python/weld/grizzly/core/__init__.py b/weld-python/weld/grizzly/core/__init__.py
new file mode 100644
index 000000000..8b1378917
--- /dev/null
+++ b/weld-python/weld/grizzly/core/__init__.py
@@ -0,0 +1 @@
+
diff --git a/weld-python/weld/grizzly/error.py b/weld-python/weld/grizzly/core/error.py
similarity index 100%
rename from weld-python/weld/grizzly/error.py
rename to weld-python/weld/grizzly/core/error.py
diff --git a/weld-python/weld/grizzly/series.py b/weld-python/weld/grizzly/core/series.py
similarity index 74%
rename from weld-python/weld/grizzly/series.py
rename to weld-python/weld/grizzly/core/series.py
index 3c4b69207..0b5b2b124 100644
--- a/weld-python/weld/grizzly/series.py
+++ b/weld-python/weld/grizzly/core/series.py
@@ -10,11 +10,11 @@
 import weld.encoders.numpy as wenp
 import weld.grizzly.weld.str as weldstr
 
-from pandas.core.internals import SingleBlockManager
 from weld.lazy import PhysicalValue, WeldLazy, WeldNode, identity
+from weld.grizzly.weld.ops import *
+from weld.grizzly.core.error import GrizzlyError
+from weld.grizzly.core.strings import StringMethods
 from weld.types import *
-from .weld.ops import *
-from .error import *
 
 def _grizzlyseries_constructor_with_fallback(data=None, **kwargs):
     """
@@ -26,246 +26,6 @@ def _grizzlyseries_constructor_with_fallback(data=None, **kwargs):
     except TypeError:
         return pd.Series(data=data, **kwargs)
 
-class StringMethods(object):
-    """
-    String methods for Grizzly. Currently, string methods only apply to ASCII
-    strings; while users can pass UTF-8 strings into Grizzly, their codepoints
-    will be ignored by the below operations and will be returned unmodified.
-
-    """
-
-    __slots__ = [ "series" ]
-
-    def __init__(self, series):
-        if series.dtype.char != 'S':
-            raise ValueError("StringMethods only available for Series with dtype 'S'")
-        self.series = series
-
-    def to_pandas(self):
-        """
-        Convert a GrizzlySeries of strings to a Pandas series.
-
-        We provide a specialized implementation of `to_pandas` here that will perform UTF-8 decoding
-        of the raw bytestrings that Grizzly series operate over.
-
-        Examples
-        --------
-        >>> x = GrizzlySeries(["Welcome", "to", "Grizzly!"])
-        >>> x
-        0     b'Welcome'
-        1          b'to'
-        2    b'Grizzly!'
-        dtype: bytes64
-        >>> x.str.to_pandas()
-        0     Welcome
-        1          to
-        2    Grizzly!
-        dtype: object
-
-        """
-        return self.series.evaluate().to_pandas().str.decode("utf-8")
-
-    def _apply(self, func, *args, return_weld_elem_type=None):
-        """
-        Apply the given weldfunc to `self.series` and return a new GrizzlySeries.
-
-        If the return type of the result is not a string GrizzlySeries, pass
-        'return_weld_elem_type' to specify the element type of the result.
-
-        """
-        output_type = self.series.output_type if return_weld_elem_type is None else WeldVec(return_weld_elem_type)
-        dtype = 'S' if return_weld_elem_type is None else wenp.weld_type_to_dtype(return_weld_elem_type)
-        lazy = func(self.series.weld_value_, *args)(output_type, GrizzlySeries._decoder)
-        return GrizzlySeries(lazy, dtype=dtype)
-
-    def lower(self):
-        """
-        Lowercase strings.
-
-        Examples
-        --------
-        >>> x = GrizzlySeries(["HELLO", "WorLD"])
-        >>> x.str.lower().str.to_pandas()
-        0    hello
-        1    world
-        dtype: object
-
-        """
-        return self._apply(weldstr.lower)
-
-    def upper(self):
-        """
-        Uppercase strings.
-
-        Examples
-        --------
-        >>> x = GrizzlySeries(["hello", "WorlD"])
-        >>> x.str.upper().str.to_pandas()
-        0    HELLO
-        1    WORLD
-        dtype: object
-
-        """
-        return self._apply(weldstr.upper)
-
-    def capitalize(self):
-        """
-        Capitalize the first character in each string.
-
-        Examples
-        --------
-        >>> x = GrizzlySeries(["hello", "worlD"])
-        >>> x.str.capitalize().str.to_pandas()
-        0    Hello
-        1    World
-        dtype: object
-
-        """
-        return self._apply(weldstr.capitalize)
-
-    def get(self, index):
-        """
-        Get the character at index 'i' from each string. If 'index' is greater than
-        the string length, this returns an empty string. If 'index' is less than 0,
-        this wraps around, using Python's indexing behavior.
-
-        Examples
-        --------
-        >>> x = GrizzlySeries(["hello", "worlD"])
-        >>> x.str.get(4).str.to_pandas()
-        0    o
-        1    D
-        dtype: object
-        >>> x.str.get(-3).str.to_pandas()
-        0    l
-        1    r
-        dtype: object
-
-        """
-        return self._apply(weldstr.get, index)
-
-    def strip(self):
-        """
-        Strip whitespace from the string.
-
-        Examples
-        --------
-        >>> x = GrizzlySeries(["     hello   ", "   world    \t  "])
-        >>> x.str.strip().str.to_pandas()
-        0    hello
-        1    world
-        dtype: object
-
-        """
-        return self._apply(weldstr.strip)
-
-    def contains(self, pat):
-        """
-        Returns whether each string contains the provided pattern.
-
-        Pattern must be a Python string.
-
-        Examples
-        --------
-        >>> x = GrizzlySeries(["hello", "world"])
-        >>> x.str.contains('wor').evaluate()
-        0    False
-        1     True
-        dtype: bool
-
-        """
-        if not isinstance(pat, str):
-            raise TypeError("pattern in contains must be a Python 'str'")
-        return self._apply(weldstr.contains, pat, return_weld_elem_type=Bool())
-
-    def startswith(self, pat):
-        """
-        Returns whether each string starts with the provided pattern.
-
-        Pattern must be a Python string.
-
-        Examples
-        --------
-        >>> x = GrizzlySeries(["hello", "world"])
-        >>> x.str.startswith('wo').evaluate()
-        0    False
-        1     True
-        dtype: bool
-
-        """
-        if not isinstance(pat, str):
-            raise TypeError("pattern in startswith must be a Python 'str'")
-        return self._apply(weldstr.startswith, pat, return_weld_elem_type=Bool())
-
-    def endswith(self, pat):
-        """
-        Returns whether each string starts with the provided pattern.
-
-        Pattern must be a Python string.
-
-        Examples
-        --------
-        >>> x = GrizzlySeries(["hello", "world"])
-        >>> x.str.endswith('rld').evaluate()
-        0    False
-        1     True
-        dtype: bool
-
-        """
-        if not isinstance(pat, str):
-            raise TypeError("pattern in endswith must be a Python 'str'")
-        return self._apply(weldstr.endswith, pat, return_weld_elem_type=Bool())
-
-    def find(self, sub, start=0, end=None):
-        """
-        Find 'sub' in each string. Each string is searched in the range [start,end].
-
-        'sub' must be a Python string, and 'start' and 'end' must be Python integers.
-
-        Examples
-        --------
-        >>> x = GrizzlySeries(["bigfatcat", "fatcatbig", "reallybigcat"])
-        >>> x.str.find('fat').evaluate()
-        0    3
-        1    0
-        2   -1
-        dtype: int64
-        >>> x.str.find('big', end=2).evaluate()
-        0    0
-        1   -1
-        2   -1
-        dtype: int64
-
-        """
-        if not isinstance(sub, str):
-            raise TypeError("sub in find must be a Python 'str'")
-        if not isinstance(start, int):
-            raise TypeError("start in find must be a Python 'int'")
-        if end is not None and not isinstance(end, int):
-            raise TypeError("end in find must be a Python 'int'")
-        return self._apply(weldstr.find, sub, start, end, return_weld_elem_type=I64())
-
-    def replace(self, pat, rep):
-        """
-        Replaces the first occurrence of 'pat' with 'rep' in each string.
-
-        Pattern and replacement must be Python strings.
-
-        Examples
-        --------
-        >>> x = GrizzlySeries(["hello", "world"])
-        >>> x.str.replace('o', 'lalala').str.to_pandas()
-        0    helllalala
-        1    wlalalarld
-        dtype: object
-
-        """
-        if not isinstance(pat, str):
-            raise TypeError("pattern in replace must be a Python 'str'")
-        if not isinstance(rep, str):
-            raise TypeError("replacement in replace must be a Python 'str'")
-        return self._apply(weldstr.replace, pat, rep)
-
 class GrizzlySeries(pd.Series):
     """
     A lazy `Series` object backed by a Weld computation.
@@ -353,7 +113,7 @@ def values(self):
         >>> x.values
         Traceback (most recent call last):
         ...
-        weld.grizzly.error.GrizzlyError: GrizzlySeries is not evaluated and does not have values. Try calling 'evaluate()' first.
+        weld.grizzly.core.error.GrizzlyError: GrizzlySeries is not evaluated and does not have values. Try calling 'evaluate()' first.
         """
         if not self.is_value:
             raise GrizzlyError("GrizzlySeries is not evaluated and does not have values. Try calling 'evaluate()' first.")
@@ -474,13 +234,13 @@ def __new__(cls, data, dtype=None, index=None, **kwargs):
         2    3
         dtype: int64
         >>> x.__class__
-        <class 'weld.grizzly.series.GrizzlySeries'>
+        <class 'weld.grizzly.core.series.GrizzlySeries'>
         >>> x = GrizzlySeries(np.ones(5))
         >>> x.__class__
-        <class 'weld.grizzly.series.GrizzlySeries'>
+        <class 'weld.grizzly.core.series.GrizzlySeries'>
         >>> y = GrizzlySeries(['hi', 'bye'])
         >>> y.__class__
-        <class 'weld.grizzly.series.GrizzlySeries'>
+        <class 'weld.grizzly.core.series.GrizzlySeries'>
         >>> y = GrizzlySeries([1, 2, 3], index=[1, 0, 2]) # Unsupported
         >>> y.__class__
         <class 'pandas.core.series.Series'>
diff --git a/weld-python/weld/grizzly/core/strings.py b/weld-python/weld/grizzly/core/strings.py
new file mode 100644
index 000000000..6db4162ee
--- /dev/null
+++ b/weld-python/weld/grizzly/core/strings.py
@@ -0,0 +1,263 @@
+"""
+String methods supported by Series.
+
+"""
+
+import weld.encoders.numpy as wenp
+import weld.grizzly.weld.str as weldstr
+
+from weld.types import *
+
+class StringMethods(object):
+    """
+    String methods for Grizzly. Currently, string methods only apply to ASCII
+    strings; while users can pass UTF-8 strings into Grizzly, their codepoints
+    will be ignored by the below operations and will be returned unmodified.
+
+    """
+
+    __slots__ = [ "series", "constructor" ]
+
+    def __init__(self, series):
+        if series.dtype.char != 'S':
+            raise ValueError("StringMethods only available for Series with dtype 'S'")
+        self.series = series
+        # TODO(shoumik): This is a hack: we should define an abstract class that captures
+        # the interface additional functionality needs.
+        self.constructor = self.series.__class__
+
+    def to_pandas(self):
+        """
+        Convert an array of strings to a Pandas series.
+
+        We provide a specialized implementation of `to_pandas` here that will perform UTF-8 decoding
+        of the raw bytestrings that Grizzly series operate over.
+
+        Examples
+        --------
+        >>> from weld.grizzly import GrizzlySeries
+        >>> x = GrizzlySeries(["Welcome", "to", "Grizzly!"])
+        >>> x
+        0     b'Welcome'
+        1          b'to'
+        2    b'Grizzly!'
+        dtype: bytes64
+        >>> x.str.to_pandas()
+        0     Welcome
+        1          to
+        2    Grizzly!
+        dtype: object
+
+        """
+        return self.series.evaluate().to_pandas().str.decode("utf-8")
+
+    def _apply(self, func, *args, return_weld_elem_type=None):
+        """
+        Apply the given weldfunc to `self.series` and return a new GrizzlySeries.
+
+        If the return type of the result is not a string GrizzlySeries, pass
+        'return_weld_elem_type' to specify the element type of the result.
+
+        """
+        output_type = self.series.output_type if return_weld_elem_type is None else WeldVec(return_weld_elem_type)
+        dtype = 'S' if return_weld_elem_type is None else wenp.weld_type_to_dtype(return_weld_elem_type)
+        lazy = func(self.series.weld_value_, *args)(output_type, self.constructor._decoder)
+        return (self.constructor)(lazy, dtype=dtype)
+
+    def lower(self):
+        """
+        Lowercase strings.
+
+        Examples
+        --------
+        >>> from weld.grizzly import GrizzlySeries
+        >>> x = GrizzlySeries(["HELLO", "WorLD"])
+        >>> x.str.lower().str.to_pandas()
+        0    hello
+        1    world
+        dtype: object
+
+        """
+        return self._apply(weldstr.lower)
+
+    def upper(self):
+        """
+        Uppercase strings.
+
+        Examples
+        --------
+        >>> from weld.grizzly import GrizzlySeries
+        >>> x = GrizzlySeries(["hello", "WorlD"])
+        >>> x.str.upper().str.to_pandas()
+        0    HELLO
+        1    WORLD
+        dtype: object
+
+        """
+        return self._apply(weldstr.upper)
+
+    def capitalize(self):
+        """
+        Capitalize the first character in each string.
+
+        Examples
+        --------
+        >>> from weld.grizzly import GrizzlySeries
+        >>> x = GrizzlySeries(["hello", "worlD"])
+        >>> x.str.capitalize().str.to_pandas()
+        0    Hello
+        1    World
+        dtype: object
+
+        """
+        return self._apply(weldstr.capitalize)
+
+    def get(self, index):
+        """
+        Get the character at index 'index' from each string. If 'index' is greater than
+        the string length, this returns an empty string. If 'index' is less than 0,
+        this wraps around, using Python's indexing behavior.
+
+        Examples
+        --------
+        >>> from weld.grizzly import GrizzlySeries
+        >>> x = GrizzlySeries(["hello", "worlD"])
+        >>> x.str.get(4).str.to_pandas()
+        0    o
+        1    D
+        dtype: object
+        >>> x.str.get(-3).str.to_pandas()
+        0    l
+        1    r
+        dtype: object
+
+        """
+        return self._apply(weldstr.get, index)
+
+    def strip(self):
+        """
+        Strip whitespace from the string.
+
+        Examples
+        --------
+        >>> from weld.grizzly import GrizzlySeries
+        >>> x = GrizzlySeries(["     hello   ", "   world    \t  "])
+        >>> x.str.strip().str.to_pandas()
+        0    hello
+        1    world
+        dtype: object
+
+        """
+        return self._apply(weldstr.strip)
+
+    def contains(self, pat):
+        """
+        Returns whether each string contains the provided pattern.
+
+        Pattern must be a Python string.
+
+        Examples
+        --------
+        >>> from weld.grizzly import GrizzlySeries
+        >>> x = GrizzlySeries(["hello", "world"])
+        >>> x.str.contains('wor').evaluate()
+        0    False
+        1     True
+        dtype: bool
+
+        """
+        if not isinstance(pat, str):
+            raise TypeError("pattern in contains must be a Python 'str'")
+        return self._apply(weldstr.contains, pat, return_weld_elem_type=Bool())
+
+    def startswith(self, pat):
+        """
+        Returns whether each string starts with the provided pattern.
+
+        Pattern must be a Python string.
+
+        Examples
+        --------
+        >>> from weld.grizzly import GrizzlySeries
+        >>> x = GrizzlySeries(["hello", "world"])
+        >>> x.str.startswith('wo').evaluate()
+        0    False
+        1     True
+        dtype: bool
+
+        """
+        if not isinstance(pat, str):
+            raise TypeError("pattern in startswith must be a Python 'str'")
+        return self._apply(weldstr.startswith, pat, return_weld_elem_type=Bool())
+
+    def endswith(self, pat):
+        """
+        Returns whether each string ends with the provided pattern.
+
+        Pattern must be a Python string.
+
+        Examples
+        --------
+        >>> from weld.grizzly import GrizzlySeries
+        >>> x = GrizzlySeries(["hello", "world"])
+        >>> x.str.endswith('rld').evaluate()
+        0    False
+        1     True
+        dtype: bool
+
+        """
+        if not isinstance(pat, str):
+            raise TypeError("pattern in endswith must be a Python 'str'")
+        return self._apply(weldstr.endswith, pat, return_weld_elem_type=Bool())
+
+    def find(self, sub, start=0, end=None):
+        """
+        Find 'sub' in each string. Each string is searched in the range [start,end].
+
+        'sub' must be a Python string, and 'start' and 'end' must be Python integers.
+
+        Examples
+        --------
+        >>> from weld.grizzly import GrizzlySeries
+        >>> x = GrizzlySeries(["bigfatcat", "fatcatbig", "reallybigcat"])
+        >>> x.str.find('fat').evaluate()
+        0    3
+        1    0
+        2   -1
+        dtype: int64
+        >>> x.str.find('big', end=2).evaluate()
+        0    0
+        1   -1
+        2   -1
+        dtype: int64
+
+        """
+        if not isinstance(sub, str):
+            raise TypeError("sub in find must be a Python 'str'")
+        if not isinstance(start, int):
+            raise TypeError("start in find must be a Python 'int'")
+        if end is not None and not isinstance(end, int):
+            raise TypeError("end in find must be a Python 'int'")
+        return self._apply(weldstr.find, sub, start, end, return_weld_elem_type=I64())
+
+    def replace(self, pat, rep):
+        """
+        Replaces the first occurrence of 'pat' with 'rep' in each string.
+
+        Pattern and replacement must be Python strings.
+
+        Examples
+        --------
+        >>> from weld.grizzly import GrizzlySeries
+        >>> x = GrizzlySeries(["hello", "world"])
+        >>> x.str.replace('o', 'lalala').str.to_pandas()
+        0    helllalala
+        1    wlalalarld
+        dtype: object
+
+        """
+        if not isinstance(pat, str):
+            raise TypeError("pattern in replace must be a Python 'str'")
+        if not isinstance(rep, str):
+            raise TypeError("replacement in replace must be a Python 'str'")
+        return self._apply(weldstr.replace, pat, rep)

From 59e4ccd41e8b7352c408c356d930511582a1fe41 Mon Sep 17 00:00:00 2001
From: Shoumik Palkar <shoumik@cs.stanford.edu>
Date: Sat, 21 Mar 2020 00:10:24 -0700
Subject: [PATCH 7/9] add tests for strings

---
 .../tests/grizzly/{ => core}/test_series.py   |   0
 .../tests/grizzly/core/test_strings.py        | 120 ++++++++++++++++++
 weld-python/weld/grizzly/core/strings.py      |   4 +-
 weld-python/weld/grizzly/weld/str.py          |   7 +-
 4 files changed, 125 insertions(+), 6 deletions(-)
 rename weld-python/tests/grizzly/{ => core}/test_series.py (100%)
 create mode 100644 weld-python/tests/grizzly/core/test_strings.py

diff --git a/weld-python/tests/grizzly/test_series.py b/weld-python/tests/grizzly/core/test_series.py
similarity index 100%
rename from weld-python/tests/grizzly/test_series.py
rename to weld-python/tests/grizzly/core/test_series.py
diff --git a/weld-python/tests/grizzly/core/test_strings.py b/weld-python/tests/grizzly/core/test_strings.py
new file mode 100644
index 000000000..cef4539cc
--- /dev/null
+++ b/weld-python/tests/grizzly/core/test_strings.py
@@ -0,0 +1,120 @@
+"""
+Test string functionality.
+
+The behavior is tested against Pandas unless noted otherwise.
+
+"""
+
+import numpy as np
+import pandas as pd
+import pytest
+import weld.grizzly as gr
+
+# To check whether the output is a string.
+# TODO(shoumik): There should be a better way to do this, another reason
+# to use ExtensionArray and a custom dtype for Weldified string arrays.
+from weld.types import WeldVec, I8
+
+def compare_vs_pandas(func, strings, *args, **kwargs):
+    pandas_series = pd.Series(strings)
+    grizzly_series = gr.GrizzlySeries(strings)
+
+    pandas_result = getattr(pandas_series.str, func)(*args, **kwargs)
+    grizzly_result = getattr(grizzly_series.str, func)(*args, **kwargs)
+    if grizzly_result.output_type.elem_type != WeldVec(I8()):
+        grizzly_result = grizzly_result.to_pandas()
+    else:
+        # Perform UTF-8 decoding.
+        grizzly_result = grizzly_result.str.to_pandas()
+    assert pandas_result.equals(grizzly_result)
+
+def test_to_pandas():
+    pass
+
+# Strings to test capitalization functions.
+capitals_strings = [
+        "hello",  "HELLO", "LonGHelLO", "",
+        "3.141592, it's pi!", "many words in this one"]
+
+def test_lower():
+    compare_vs_pandas('lower', capitals_strings)
+
+def test_upper():
+    compare_vs_pandas('upper', capitals_strings)
+
+def test_capitalize():
+    compare_vs_pandas('capitalize', capitals_strings)
+
+def test_get():
+    """
+    Behavior of get is different in Grizzly -- it currently returns empty strings
+    in cases where Pandas returns NaN. This will be changed in a later patch.
+
+    """
+    inp = ["hello", "world", "test", "me", '']
+    expect = ['l', 'l', 't', '', '']
+    grizzly_result = gr.GrizzlySeries(inp).str.get(3).str.to_pandas()
+    pandas_result = pd.Series(expect)
+    assert pandas_result.equals(grizzly_result)
+
+    expect = ['o', 'd', 't', 'e', '']
+    grizzly_result = gr.GrizzlySeries(inp).str.get(-1).str.to_pandas()
+    pandas_result = pd.Series(expect)
+    assert pandas_result.equals(grizzly_result)
+
+    expect = ['', '', '', '', '']
+    grizzly_result = gr.GrizzlySeries(inp).str.get(-50).str.to_pandas()
+    pandas_result = pd.Series(expect)
+    assert pandas_result.equals(grizzly_result)
+
+def test_strip():
+    compare_vs_pandas('strip', ["",
+    "   hi   ",
+    "\t\thi\n",
+    """
+
+    hello
+
+    """,
+    "    \t goodbye",
+    "goodbye again    ",
+    "   \n hi \n bye \n ",
+    """
+
+    hi
+
+    bye
+
+    """])
+
+def test_contains():
+    compare_vs_pandas('contains', ["abc", "abcdefg", "gfedcbaabcabcdef", ""], "abc")
+
+def test_startswith():
+    compare_vs_pandas('startswith', ["abc", "abcdefg", "gfedcba", "", "defabc"], "abc")
+
+def test_endswith():
+    compare_vs_pandas('endswith', ["abc", "abcdefg", "gfedabc", "", "defabc"], "abc")
+
+def test_find():
+    compare_vs_pandas('find', ["abc", "abcdefg", "gfedcbaabcabcdef", ""], "abc")
+    compare_vs_pandas('find', ["abc", "abcdefg", "gfedcbaabcabcdef", ""], "abc", 2)
+    compare_vs_pandas('find', ["abc", "abcdefg", "gfedcbaabcabcdef", ""], "abc", 3)
+    compare_vs_pandas('find', ["abc", "abcdefg", "gfedcbaabcabcdef", ""], "abc", end=2)
+    compare_vs_pandas('find', ["abc", "abcdefg", "gfedcbaabcabcdef", ""], "abc", end=3)
+    compare_vs_pandas('find', ["abc", "abcdefg", "gfedcbaabcabcdef", ""], "abc", 3, end=7)
+    compare_vs_pandas('find', ["abc", "abcdefg", "gfedcbaabcabcdef", ""], "abc", 100, end=105)
+
+def test_replace():
+    """
+    Behavior of replace is different in Grizzly -- it currently only replaces the *first*
+    occurrence. This will be changed in a later patch.
+
+    """
+    import copy
+    inp = ["abc", "abcdefg", "abcabcabc", "gfedcbaabcabcdef", "", "XYZ"]
+    expect = [s.replace("abc", "XYZ", 1) for s in copy.copy(inp)]
+    grizzly_result = gr.GrizzlySeries(inp).str.replace("abc", "XYZ").str.to_pandas()
+    pandas_result = pd.Series(expect)
+    assert pandas_result.equals(grizzly_result)
+
diff --git a/weld-python/weld/grizzly/core/strings.py b/weld-python/weld/grizzly/core/strings.py
index 6db4162ee..1616e5fc1 100644
--- a/weld-python/weld/grizzly/core/strings.py
+++ b/weld-python/weld/grizzly/core/strings.py
@@ -212,7 +212,7 @@ def endswith(self, pat):
 
     def find(self, sub, start=0, end=None):
         """
-        Find 'sub' in each string. Each string is searched in the range [start,end].
+        Find 'sub' in each string. Each string is searched in the range [start,end).
 
         'sub' must be a Python string, and 'start' and 'end' must be Python integers.
 
@@ -226,7 +226,7 @@ def find(self, sub, start=0, end=None):
         2   -1
         dtype: int64
         >>> x.str.find('big', end=2).evaluate()
-        0    0
+        0   -1
         1   -1
         2   -1
         dtype: int64
diff --git a/weld-python/weld/grizzly/weld/str.py b/weld-python/weld/grizzly/weld/str.py
index 0565d0052..7f7622a3d 100644
--- a/weld-python/weld/grizzly/weld/str.py
+++ b/weld-python/weld/grizzly/weld/str.py
@@ -258,8 +258,7 @@ def find(stringarr, sub, start, end=None):
 
     start = "i64({})".format(start)
     if end is None:
-        # -1L since we add 1L below to make the range inclusive in the size.
-        end = 'len(e) - 1L'
+        end = 'len(e)'
     else:
         end = "i64({})".format(end)
 
@@ -271,8 +270,8 @@ def find(stringarr, sub, start, end=None):
     map({stringarr},
         |e: vec[i8]|
             let start = {start};
-            let size = {end} - start + 1L; # Search [start:end], so add 1L.
-            if (start < 0L || start + size > len(e),
+            let size = {end} - start;
+            if (start < 0L,
                 -1L,
                 let string = slice(e, start, size);
                 let lenString = len(string);

From c8fb7873595978d40567f5b7a575f64786d12eed Mon Sep 17 00:00:00 2001
From: Shoumik Palkar <shoumik@cs.stanford.edu>
Date: Sat, 21 Mar 2020 00:10:58 -0700
Subject: [PATCH 8/9] remove some dead code

---
 weld-python/tests/grizzly/core/test_strings.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/weld-python/tests/grizzly/core/test_strings.py b/weld-python/tests/grizzly/core/test_strings.py
index cef4539cc..e760c1cd0 100644
--- a/weld-python/tests/grizzly/core/test_strings.py
+++ b/weld-python/tests/grizzly/core/test_strings.py
@@ -28,9 +28,6 @@ def compare_vs_pandas(func, strings, *args, **kwargs):
         grizzly_result = grizzly_result.str.to_pandas()
     assert pandas_result.equals(grizzly_result)
 
-def test_to_pandas():
-    pass
-
 # Strings to test capitalization functions.
 capitals_strings = [
         "hello",  "HELLO", "LonGHelLO", "",

From 311d3143028aea7a32eaa77a053cd691b313241a Mon Sep 17 00:00:00 2001
From: Shoumik Palkar <shoumik@cs.stanford.edu>
Date: Sat, 21 Mar 2020 09:42:37 -0700
Subject: [PATCH 9/9] add conftest

---
 weld-python/weld/conftest.py             | 15 +++++++++++
 weld-python/weld/grizzly/core/strings.py | 33 ++++++++----------------
 2 files changed, 26 insertions(+), 22 deletions(-)
 create mode 100644 weld-python/weld/conftest.py

diff --git a/weld-python/weld/conftest.py b/weld-python/weld/conftest.py
new file mode 100644
index 000000000..acbd85af3
--- /dev/null
+++ b/weld-python/weld/conftest.py
@@ -0,0 +1,15 @@
+
+import pytest
+
+import numpy as np
+import pandas as pd
+import weld.grizzly as gr
+
+@pytest.fixture(autouse=True)
+def add_imports(doctest_namespace):
+    """
+    Make `gr`, `np`, and `pd` available for doctests.
+    """
+    doctest_namespace["np"] = np
+    doctest_namespace["pd"] = pd
+    doctest_namespace["gr"] = gr
diff --git a/weld-python/weld/grizzly/core/strings.py b/weld-python/weld/grizzly/core/strings.py
index 1616e5fc1..e603a8cc6 100644
--- a/weld-python/weld/grizzly/core/strings.py
+++ b/weld-python/weld/grizzly/core/strings.py
@@ -35,8 +35,7 @@ def to_pandas(self):
 
         Examples
         --------
-        >>> from weld.grizzly import GrizzlySeries
-        >>> x = GrizzlySeries(["Welcome", "to", "Grizzly!"])
+        >>> x = gr.GrizzlySeries(["Welcome", "to", "Grizzly!"])
         >>> x
         0     b'Welcome'
         1          b'to'
@@ -70,8 +69,7 @@ def lower(self):
 
         Examples
         --------
-        >>> from weld.grizzly import GrizzlySeries
-        >>> x = GrizzlySeries(["HELLO", "WorLD"])
+        >>> x = gr.GrizzlySeries(["HELLO", "WorLD"])
         >>> x.str.lower().str.to_pandas()
         0    hello
         1    world
@@ -86,8 +84,7 @@ def upper(self):
 
         Examples
         --------
-        >>> from weld.grizzly import GrizzlySeries
-        >>> x = GrizzlySeries(["hello", "WorlD"])
+        >>> x = gr.GrizzlySeries(["hello", "WorlD"])
         >>> x.str.upper().str.to_pandas()
         0    HELLO
         1    WORLD
@@ -102,8 +99,7 @@ def capitalize(self):
 
         Examples
         --------
-        >>> from weld.grizzly import GrizzlySeries
-        >>> x = GrizzlySeries(["hello", "worlD"])
+        >>> x = gr.GrizzlySeries(["hello", "worlD"])
         >>> x.str.capitalize().str.to_pandas()
         0    Hello
         1    World
@@ -120,8 +116,7 @@ def get(self, index):
 
         Examples
         --------
-        >>> from weld.grizzly import GrizzlySeries
-        >>> x = GrizzlySeries(["hello", "worlD"])
+        >>> x = gr.GrizzlySeries(["hello", "worlD"])
         >>> x.str.get(4).str.to_pandas()
         0    o
         1    D
@@ -140,8 +135,7 @@ def strip(self):
 
         Examples
         --------
-        >>> from weld.grizzly import GrizzlySeries
-        >>> x = GrizzlySeries(["     hello   ", "   world    \t  "])
+        >>> x = gr.GrizzlySeries(["     hello   ", "   world    \t  "])
         >>> x.str.strip().str.to_pandas()
         0    hello
         1    world
@@ -158,8 +152,7 @@ def contains(self, pat):
 
         Examples
         --------
-        >>> from weld.grizzly import GrizzlySeries
-        >>> x = GrizzlySeries(["hello", "world"])
+        >>> x = gr.GrizzlySeries(["hello", "world"])
         >>> x.str.contains('wor').evaluate()
         0    False
         1     True
@@ -178,8 +171,7 @@ def startswith(self, pat):
 
         Examples
         --------
-        >>> from weld.grizzly import GrizzlySeries
-        >>> x = GrizzlySeries(["hello", "world"])
+        >>> x = gr.GrizzlySeries(["hello", "world"])
         >>> x.str.startswith('wo').evaluate()
         0    False
         1     True
@@ -198,8 +190,7 @@ def endswith(self, pat):
 
         Examples
         --------
-        >>> from weld.grizzly import GrizzlySeries
-        >>> x = GrizzlySeries(["hello", "world"])
+        >>> x = gr.GrizzlySeries(["hello", "world"])
         >>> x.str.endswith('rld').evaluate()
         0    False
         1     True
@@ -218,8 +209,7 @@ def find(self, sub, start=0, end=None):
 
         Examples
         --------
-        >>> from weld.grizzly import GrizzlySeries
-        >>> x = GrizzlySeries(["bigfatcat", "fatcatbig", "reallybigcat"])
+        >>> x = gr.GrizzlySeries(["bigfatcat", "fatcatbig", "reallybigcat"])
         >>> x.str.find('fat').evaluate()
         0    3
         1    0
@@ -248,8 +238,7 @@ def replace(self, pat, rep):
 
         Examples
         --------
-        >>> from weld.grizzly import GrizzlySeries
-        >>> x = GrizzlySeries(["hello", "world"])
+        >>> x = gr.GrizzlySeries(["hello", "world"])
         >>> x.str.replace('o', 'lalala').str.to_pandas()
         0    helllalala
         1    wlalalarld