weld-project · sppalkia · Mar 21, 2020 · Mar 18, 2020 · Mar 18, 2020 · Mar 19, 2020
diff --git a/weld-python/tests/grizzly/test_series.py → ...-python/tests/grizzly/core/test_series.py b/weld-python/tests/grizzly/test_series.py → ...-python/tests/grizzly/core/test_series.py
@@ -6,7 +6,7 @@
 import numpy as np
 import pandas as pd
 import pytest
-import weld.grizzly.series as gr
+import weld.grizzly as gr
 
 types_ = ['int8', 'uint8', 'int16', 'uint16', 'int32',\
         'uint32', 'int64', 'uint64', 'float32', 'float64']
@@ -134,7 +134,7 @@ def test_indexing():
 
 def test_unsupported_binop_error():
     # Test unsupported
-    from weld.grizzly.error import GrizzlyError
+    from weld.grizzly.core.error import GrizzlyError
     with pytest.raises(GrizzlyError):
         a = gr.GrizzlySeries([1,2,3])
         b = pd.Series([1,2,3])

diff --git a/weld-python/tests/grizzly/core/test_strings.py b/weld-python/tests/grizzly/core/test_strings.py
@@ -0,0 +1,117 @@
+"""
+Test string functionality.
+
+The behavior is tested against Pandas unless noted otherwise.
+
+"""
+
+import numpy as np
+import pandas as pd
+import pytest
+import weld.grizzly as gr
+
+# To check whether the output is a string.
+# TODO(shoumik): There should be a better way to do this, another reason
+# to use ExtensionArray and a custom dtype for Weldified string arrays.
+from weld.types import WeldVec, I8
+
+def compare_vs_pandas(func, strings, *args, **kwargs):
+    pandas_series = pd.Series(strings)
+    grizzly_series = gr.GrizzlySeries(strings)
+
+    pandas_result = getattr(pandas_series.str, func)(*args, **kwargs)
+    grizzly_result = getattr(grizzly_series.str, func)(*args, **kwargs)
+    if grizzly_result.output_type.elem_type != WeldVec(I8()):
+        grizzly_result = grizzly_result.to_pandas()
+    else:
+        # Perform UTF-8 decoding.
+        grizzly_result = grizzly_result.str.to_pandas()
+    assert pandas_result.equals(grizzly_result)
+
+# Strings to test capitalization functions.
+capitals_strings = [
+        "hello",  "HELLO", "LonGHelLO", "",
+        "3.141592, it's pi!", "many words in this one"]
+
+def test_lower():
+    compare_vs_pandas('lower', capitals_strings)
+
+def test_upper():
+    compare_vs_pandas('upper', capitals_strings)
+
+def test_capitalize():
+    compare_vs_pandas('capitalize', capitals_strings)
+
+def test_get():
+    """
+    Behavior of get is different in Grizzly -- it currently returns empty strings
+    in cases where Pandas returns NaN. This will be changed in a later patch.
+
+    """
+    inp = ["hello", "world", "test", "me", '']
+    expect = ['l', 'l', 't', '', '']
+    grizzly_result = gr.GrizzlySeries(inp).str.get(3).str.to_pandas()
+    pandas_result = pd.Series(expect)
+    assert pandas_result.equals(grizzly_result)
+
+    expect = ['o', 'd', 't', 'e', '']
+    grizzly_result = gr.GrizzlySeries(inp).str.get(-1).str.to_pandas()
+    pandas_result = pd.Series(expect)
+    assert pandas_result.equals(grizzly_result)
+
+    expect = ['', '', '', '', '']
+    grizzly_result = gr.GrizzlySeries(inp).str.get(-50).str.to_pandas()
+    pandas_result = pd.Series(expect)
+    assert pandas_result.equals(grizzly_result)
+
+def test_strip():
+    compare_vs_pandas('strip', ["",
+    "   hi   ",
+    "\t\thi\n",
+    """
+
+    hello
+
+    """,
+    "    \t goodbye",
+    "goodbye again    ",
+    "   \n hi \n bye \n ",
+    """
+
+    hi
+
+    bye
+
+    """])
+
+def test_contains():
+    compare_vs_pandas('contains', ["abc", "abcdefg", "gfedcbaabcabcdef", ""], "abc")
+
+def test_startswith():
+    compare_vs_pandas('startswith', ["abc", "abcdefg", "gfedcba", "", "defabc"], "abc")
+
+def test_endswith():
+    compare_vs_pandas('endswith', ["abc", "abcdefg", "gfedabc", "", "defabc"], "abc")
+
+def test_find():
+    compare_vs_pandas('find', ["abc", "abcdefg", "gfedcbaabcabcdef", ""], "abc")
+    compare_vs_pandas('find', ["abc", "abcdefg", "gfedcbaabcabcdef", ""], "abc", 2)
+    compare_vs_pandas('find', ["abc", "abcdefg", "gfedcbaabcabcdef", ""], "abc", 3)
+    compare_vs_pandas('find', ["abc", "abcdefg", "gfedcbaabcabcdef", ""], "abc", end=2)
+    compare_vs_pandas('find', ["abc", "abcdefg", "gfedcbaabcabcdef", ""], "abc", end=3)
+    compare_vs_pandas('find', ["abc", "abcdefg", "gfedcbaabcabcdef", ""], "abc", 3, end=7)
+    compare_vs_pandas('find', ["abc", "abcdefg", "gfedcbaabcabcdef", ""], "abc", 100, end=105)
+
+def test_replace():
+    """
+    Behavior of replace is different in Grizzly -- it currently only replaces the *first*
+    occurrence. This will be changed in a later patch.
+
+    """
+    import copy
+    inp = ["abc", "abcdefg", "abcabcabc", "gfedcbaabcabcdef", "", "XYZ"]
+    expect = [s.replace("abc", "XYZ", 1) for s in copy.copy(inp)]
+    grizzly_result = gr.GrizzlySeries(inp).str.replace("abc", "XYZ").str.to_pandas()
+    pandas_result = pd.Series(expect)
+    assert pandas_result.equals(grizzly_result)
+
diff --git a/weld-python/weld/compile.py b/weld-python/weld/compile.py
@@ -81,7 +81,7 @@ def compile(program, arg_types, encoders, restype, decoder, conf=None):
     >>> from weld.types import *
     >>> func = compile("|x: i32| x + 1",
     ...        [I32()],  [None],
-    ...        I32(), None) 
+    ...        I32(), None)
     ...
     >>> func(100)[0]
     101
@@ -92,7 +92,7 @@ def compile(program, arg_types, encoders, restype, decoder, conf=None):
 
     >>> func = compile("|x: i32, y: i32| x + y",
     ...        [I32(), I32()],  [None, None],
-    ...        I32(), None) 
+    ...        I32(), None)
     ...
     >>> func(5, 6)[0]
     11
@@ -101,7 +101,7 @@ def compile(program, arg_types, encoders, restype, decoder, conf=None):
 
     >>> func = compile("|x: i32| x + 1",
     ...        [I32()],  [PrimitiveWeldEncoder()],
-    ...        I32(), PrimitiveWeldDecoder()) 
+    ...        I32(), PrimitiveWeldDecoder())
     ...
     >>> func(100)[0]
     101
@@ -158,6 +158,7 @@ def func(*args, context=None):
         raw_args_pointer = ctypes.addressof(raw_args)
         value = WeldValue(raw_args_pointer)
 
+
         if context is None:
             context = WeldContext(conf)
 

diff --git a/weld-python/weld/conftest.py b/weld-python/weld/conftest.py
@@ -0,0 +1,15 @@
+
+import pytest
+
+import numpy as np
+import pandas as pd
+import weld.grizzly as gr
+
+@pytest.fixture(autouse=True)
+def add_imports(doctest_namespace):
+    """
+    Make `gr`, `np`, and `pd` available for doctests.
+    """
+    doctest_namespace["np"] = np
+    doctest_namespace["pd"] = pd
+    doctest_namespace["gr"] = gr
diff --git a/weld-python/weld/encoders/numpy.py b/weld-python/weld/encoders/numpy.py
@@ -268,7 +268,7 @@ class StringConversionFuncs(object):
     """
 
     stringfuncs = ctypes.PyDLL(weld.encoders._strings.__file__)
-    string_cclass = WeldVec(I8()).ctype_class
+    string_cclass = WeldVec(WeldVec(I8())).ctype_class
 
     @staticmethod
     def numpy_string_array_to_weld(arr):

diff --git a/weld-python/weld/grizzly/__init__.py b/weld-python/weld/grizzly/__init__.py
@@ -0,0 +1,2 @@
+
+from weld.grizzly.core.series import GrizzlySeries
diff --git a/weld-python/weld/grizzly/core/__init__.py b/weld-python/weld/grizzly/core/__init__.py
@@ -0,0 +1 @@
+
diff --git a/weld-python/weld/grizzly/error.py → weld-python/weld/grizzly/core/error.py b/weld-python/weld/grizzly/error.py → weld-python/weld/grizzly/core/error.py
diff --git a/weld-python/weld/grizzly/series.py → weld-python/weld/grizzly/core/series.py b/weld-python/weld/grizzly/series.py → weld-python/weld/grizzly/core/series.py
@@ -8,19 +8,18 @@
 import warnings
 
 import weld.encoders.numpy as wenp
+import weld.grizzly.weld.str as weldstr
 
-from pandas.core.internals import SingleBlockManager
 from weld.lazy import PhysicalValue, WeldLazy, WeldNode, identity
+from weld.grizzly.weld.ops import *
+from weld.grizzly.core.error import GrizzlyError
+from weld.grizzly.core.strings import StringMethods
 from weld.types import *
 
-from .weld.ops import *
-from .error import *
-
 def _grizzlyseries_constructor_with_fallback(data=None, **kwargs):
     """
     A flexible constructor for Series._constructor, which needs to be able
-    to fall back to a Series (if a certain operation does not produce
-    geometries)
+    to fall back to a Series (if a certain operation cannot produce a GrizzlySeries).
     """
     try:
         return GrizzlySeries(data=data, **kwargs)
@@ -114,7 +113,7 @@ def values(self):
         >>> x.values
         Traceback (most recent call last):
         ...
-        weld.grizzly.error.GrizzlyError: GrizzlySeries is not evaluated and does not have values. Try calling 'evaluate()' first.
+        weld.grizzly.core.error.GrizzlyError: GrizzlySeries is not evaluated and does not have values. Try calling 'evaluate()' first.
         """
         if not self.is_value:
             raise GrizzlyError("GrizzlySeries is not evaluated and does not have values. Try calling 'evaluate()' first.")
@@ -212,7 +211,10 @@ def _supports_grizzly(cls, data):
         """
         if not isinstance(data, np.ndarray) or data.ndim != 1:
             return None
-        elem_type = wenp.dtype_to_weld_type(data.dtype)
+        if data.dtype.char == 'S':
+            elem_type = WeldVec(I8())
+        else:
+            elem_type = wenp.dtype_to_weld_type(data.dtype)
         return WeldVec(elem_type) if elem_type is not None else None
 
     # ---------------------- Initialization ------------------------------
@@ -232,34 +234,44 @@ def __new__(cls, data, dtype=None, index=None, **kwargs):
         2    3
         dtype: int64
         >>> x.__class__
-        <class 'weld.grizzly.series.GrizzlySeries'>
+        <class 'weld.grizzly.core.series.GrizzlySeries'>
         >>> x = GrizzlySeries(np.ones(5))
         >>> x.__class__
-        <class 'weld.grizzly.series.GrizzlySeries'>
-        >>> y = GrizzlySeries(['hi', 'bye']) # Unsupported
+        <class 'weld.grizzly.core.series.GrizzlySeries'>
+        >>> y = GrizzlySeries(['hi', 'bye'])
         >>> y.__class__
-        <class 'pandas.core.series.Series'>
+        <class 'weld.grizzly.core.series.GrizzlySeries'>
         >>> y = GrizzlySeries([1, 2, 3], index=[1, 0, 2]) # Unsupported
         >>> y.__class__
         <class 'pandas.core.series.Series'>
         """
         s = None
         if isinstance(data, WeldLazy):
             self = super(GrizzlySeries, cls).__new__(cls)
-            super(GrizzlySeries, self).__init__(None, dtype=dtype, **kwargs)
+            super(GrizzlySeries, self).__init__(np.array([], dtype=dtype), **kwargs)
             self.weld_value_ = data
             return self
-        elif index is not None and not isinstance(index, pd.RangeIndex):
+
+        if index is not None and not isinstance(index, pd.RangeIndex):
             # TODO(shoumik): This is probably incomplete, since we could have a
             # RangeIndex that does not capture the full span of the data, has a
             # non-zero step, etc.
             return pd.Series(data, dtype=dtype, index=index, **kwargs)
-        elif len(kwargs) != 0:
+
+        if len(kwargs) != 0:
+            # Unsupported arguments present: bail for now.
             return pd.Series(data, dtype=dtype, index=index, **kwargs)
+
+        if isinstance(data, list) and len(data) > 0 and isinstance(data[0], str):
+            # Try to convert a list of strings into a supported NumPy array.
+            data = np.array(data, dtype='S')
+
+        if isinstance(data, pd.Series):
+            data = data.values
         elif not isinstance(data, np.ndarray):
             # First, convert the input into a Series backed by an ndarray.
-            s = pd.Series(data, dtype=dtype, index=index, **kwargs)
-            data = s.values
+             s = pd.Series(data, dtype=dtype, index=index, **kwargs)
+             data = s.values
 
         # Try to create a Weld type for the input.
         weld_type = GrizzlySeries._supports_grizzly(data)
@@ -270,9 +282,17 @@ def __new__(cls, data, dtype=None, index=None, **kwargs):
                     PhysicalValue(data, weld_type, GrizzlySeries._encoder),
                     GrizzlySeries._decoder)
             return self
+
         # Don't re-convert values if we did it once already -- it's expensive.
         return s if s is not None else pd.Series(data, dtype=dtype, index=index, **kwargs)
 
+    # ---------------------- StringMethods ------------------------------
+
+    @property
+    def str(self):
+        # TODO(shoumik.palkar): Use pandas.core.accessor.CachedAccessor?
+        return StringMethods(self)
+
     # ---------------------- Indexing ------------------------------
 
     def __setitem__(self, key, value):
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@

		from weld.grizzly.core.series import GrizzlySeries