Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add string functions #508

Merged
merged 9 commits into from
Mar 21, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import numpy as np
import pandas as pd
import pytest
import weld.grizzly.series as gr
import weld.grizzly as gr

types_ = ['int8', 'uint8', 'int16', 'uint16', 'int32',\
'uint32', 'int64', 'uint64', 'float32', 'float64']
Expand Down Expand Up @@ -134,7 +134,7 @@ def test_indexing():

def test_unsupported_binop_error():
# Test unsupported
from weld.grizzly.error import GrizzlyError
from weld.grizzly.core.error import GrizzlyError
with pytest.raises(GrizzlyError):
a = gr.GrizzlySeries([1,2,3])
b = pd.Series([1,2,3])
Expand Down
117 changes: 117 additions & 0 deletions weld-python/tests/grizzly/core/test_strings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
"""
Test string functionality.

The behavior is tested against Pandas unless noted otherwise.

"""

import numpy as np
import pandas as pd
import pytest
import weld.grizzly as gr

# To check whether the output is a string.
# TODO(shoumik): There should be a better way to do this, another reason
# to use ExtensionArray and a custom dtype for Weldified string arrays.
from weld.types import WeldVec, I8

def compare_vs_pandas(func, strings, *args, **kwargs):
    """
    Assert that a Grizzly string method agrees with its Pandas counterpart.

    Builds a Pandas Series and a GrizzlySeries from `strings`, calls
    `.str.<func>(*args, **kwargs)` on each, converts the Grizzly result to
    Pandas, and asserts that the two results are equal.

    Parameters
    ----------
    func : str
        Name of the method to look up on the `.str` accessor.
    strings : list of str
        Input values for both series.
    *args, **kwargs
        Forwarded unchanged to both method calls.

    """
    expected = getattr(pd.Series(strings).str, func)(*args, **kwargs)
    lazy_result = getattr(gr.GrizzlySeries(strings).str, func)(*args, **kwargs)

    # A result whose element type is a vector of I8 holds strings, which must
    # go through `.str.to_pandas()` so that UTF-8 decoding is performed.
    not_a_string_result = lazy_result.output_type.elem_type != WeldVec(I8())
    actual = (
        lazy_result.to_pandas()
        if not_a_string_result
        else lazy_result.str.to_pandas()
    )
    assert expected.equals(actual)

# Inputs shared by the capitalization tests (lower / upper / capitalize).
capitals_strings = [
    "hello",
    "HELLO",
    "LonGHelLO",
    "",
    "3.141592, it's pi!",
    "many words in this one",
]

def test_lower():
    """Compare `str.lower` against Pandas on the capitalization inputs."""
    method = 'lower'
    compare_vs_pandas(method, capitals_strings)

def test_upper():
    """Compare `str.upper` against Pandas on the capitalization inputs."""
    method = 'upper'
    compare_vs_pandas(method, capitals_strings)

def test_capitalize():
    """Compare `str.capitalize` against Pandas on the capitalization inputs."""
    method = 'capitalize'
    compare_vs_pandas(method, capitals_strings)

def test_get():
    """
    Check `str.get` with a positive, a negative, and a far-negative index.

    Behavior of get is different in Grizzly -- it currently returns empty
    strings in cases where Pandas returns NaN, so the expected values are
    written out by hand instead of being computed with Pandas. This will be
    changed in a later patch.

    """
    inp = ["hello", "world", "test", "me", '']
    # (index passed to `get`, expected per-element result)
    cases = [
        (3, ['l', 'l', 't', '', '']),
        (-1, ['o', 'd', 't', 'e', '']),
        (-50, ['', '', '', '', '']),
    ]
    for index, expect in cases:
        grizzly_result = gr.GrizzlySeries(inp).str.get(index).str.to_pandas()
        assert pd.Series(expect).equals(grizzly_result)

# Compares `str.strip` (called with no arguments) against Pandas via
# `compare_vs_pandas`. The inputs cover the empty string, strings padded with
# spaces, tabs and newlines on either side, and multi-line literals.
# NOTE(review): the whitespace inside these literals is significant to the
# test; it looks like it may have been altered in this view -- confirm against
# the checked-in file before editing any of the strings.
def test_strip():
compare_vs_pandas('strip', ["",
" hi ",
"\t\thi\n",
"""

hello

""",
" \t goodbye",
"goodbye again ",
" \n hi \n bye \n ",
"""

hi

bye

"""])

def test_contains():
    """Compare `str.contains("abc")` against Pandas."""
    strings = ["abc", "abcdefg", "gfedcbaabcabcdef", ""]
    compare_vs_pandas('contains', strings, "abc")

def test_startswith():
    """Compare `str.startswith("abc")` against Pandas."""
    strings = ["abc", "abcdefg", "gfedcba", "", "defabc"]
    compare_vs_pandas('startswith', strings, "abc")

def test_endswith():
    """Compare `str.endswith("abc")` against Pandas."""
    strings = ["abc", "abcdefg", "gfedabc", "", "defabc"]
    compare_vs_pandas('endswith', strings, "abc")

def test_find():
    """
    Compare `str.find("abc", ...)` against Pandas for several search windows.

    The cases cover no bounds, a start offset only, an `end` bound only, both
    together, and a window that lies past the end of every input.

    """
    strings = ["abc", "abcdefg", "gfedcbaabcabcdef", ""]
    # (positional args after the pattern, keyword args)
    cases = [
        ((), {}),
        ((2,), {}),
        ((3,), {}),
        ((), {"end": 2}),
        ((), {"end": 3}),
        ((3,), {"end": 7}),
        ((100,), {"end": 105}),
    ]
    for positional, keywords in cases:
        # Hand each comparison its own list so cases cannot affect one another.
        compare_vs_pandas('find', list(strings), "abc", *positional, **keywords)

def test_replace():
    """
    Check `str.replace` against first-occurrence-only replacement.

    Behavior of replace is different in Grizzly -- it currently only replaces
    the *first* occurrence. This will be changed in a later patch. The
    expected values are therefore built with Python's `str.replace` using a
    count of 1 rather than with Pandas.

    """
    inp = ["abc", "abcdefg", "abcabcabc", "gfedcbaabcabcdef", "", "XYZ"]
    # Iterating `inp` does not modify it, so no copy is needed here.
    expect = [s.replace("abc", "XYZ", 1) for s in inp]
    grizzly_result = gr.GrizzlySeries(inp).str.replace("abc", "XYZ").str.to_pandas()
    pandas_result = pd.Series(expect)
    assert pandas_result.equals(grizzly_result)

7 changes: 4 additions & 3 deletions weld-python/weld/compile.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def compile(program, arg_types, encoders, restype, decoder, conf=None):
>>> from weld.types import *
>>> func = compile("|x: i32| x + 1",
... [I32()], [None],
... I32(), None)
... I32(), None)
...
>>> func(100)[0]
101
Expand All @@ -92,7 +92,7 @@ def compile(program, arg_types, encoders, restype, decoder, conf=None):

>>> func = compile("|x: i32, y: i32| x + y",
... [I32(), I32()], [None, None],
... I32(), None)
... I32(), None)
...
>>> func(5, 6)[0]
11
Expand All @@ -101,7 +101,7 @@ def compile(program, arg_types, encoders, restype, decoder, conf=None):

>>> func = compile("|x: i32| x + 1",
... [I32()], [PrimitiveWeldEncoder()],
... I32(), PrimitiveWeldDecoder())
... I32(), PrimitiveWeldDecoder())
...
>>> func(100)[0]
101
Expand Down Expand Up @@ -158,6 +158,7 @@ def func(*args, context=None):
raw_args_pointer = ctypes.addressof(raw_args)
value = WeldValue(raw_args_pointer)


if context is None:
context = WeldContext(conf)

Expand Down
15 changes: 15 additions & 0 deletions weld-python/weld/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@

import pytest

import numpy as np
import pandas as pd
import weld.grizzly as gr

@pytest.fixture(autouse=True)
def add_imports(doctest_namespace):
    """
    Make `gr`, `np`, and `pd` available for doctests.

    Each module is stored in `doctest_namespace` under its usual alias.
    """
    aliases = (("np", np), ("pd", pd), ("gr", gr))
    for alias, module in aliases:
        doctest_namespace[alias] = module
2 changes: 1 addition & 1 deletion weld-python/weld/encoders/numpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,7 +268,7 @@ class StringConversionFuncs(object):
"""

stringfuncs = ctypes.PyDLL(weld.encoders._strings.__file__)
string_cclass = WeldVec(I8()).ctype_class
string_cclass = WeldVec(WeldVec(I8())).ctype_class

@staticmethod
def numpy_string_array_to_weld(arr):
Expand Down
2 changes: 2 additions & 0 deletions weld-python/weld/grizzly/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@

from weld.grizzly.core.series import GrizzlySeries
1 change: 1 addition & 0 deletions weld-python/weld/grizzly/core/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@

Original file line number Diff line number Diff line change
Expand Up @@ -8,19 +8,18 @@
import warnings

import weld.encoders.numpy as wenp
import weld.grizzly.weld.str as weldstr

from pandas.core.internals import SingleBlockManager
from weld.lazy import PhysicalValue, WeldLazy, WeldNode, identity
from weld.grizzly.weld.ops import *
from weld.grizzly.core.error import GrizzlyError
from weld.grizzly.core.strings import StringMethods
from weld.types import *

from .weld.ops import *
from .error import *

def _grizzlyseries_constructor_with_fallback(data=None, **kwargs):
"""
A flexible constructor for Series._constructor, which needs to be able
to fall back to a Series (if a certain operation does not produce
geometries)
to fall back to a Series (if a certain operation cannot produce GrizzlySeries).
"""
try:
return GrizzlySeries(data=data, **kwargs)
Expand Down Expand Up @@ -114,7 +113,7 @@ def values(self):
>>> x.values
Traceback (most recent call last):
...
weld.grizzly.error.GrizzlyError: GrizzlySeries is not evaluated and does not have values. Try calling 'evaluate()' first.
weld.grizzly.core.error.GrizzlyError: GrizzlySeries is not evaluated and does not have values. Try calling 'evaluate()' first.
"""
if not self.is_value:
raise GrizzlyError("GrizzlySeries is not evaluated and does not have values. Try calling 'evaluate()' first.")
Expand Down Expand Up @@ -212,7 +211,10 @@ def _supports_grizzly(cls, data):
"""
if not isinstance(data, np.ndarray) or data.ndim != 1:
return None
elem_type = wenp.dtype_to_weld_type(data.dtype)
if data.dtype.char == 'S':
elem_type = WeldVec(I8())
else:
elem_type = wenp.dtype_to_weld_type(data.dtype)
return WeldVec(elem_type) if elem_type is not None else None

# ---------------------- Initialization ------------------------------
Expand All @@ -232,34 +234,44 @@ def __new__(cls, data, dtype=None, index=None, **kwargs):
2 3
dtype: int64
>>> x.__class__
<class 'weld.grizzly.series.GrizzlySeries'>
<class 'weld.grizzly.core.series.GrizzlySeries'>
>>> x = GrizzlySeries(np.ones(5))
>>> x.__class__
<class 'weld.grizzly.series.GrizzlySeries'>
>>> y = GrizzlySeries(['hi', 'bye']) # Unsupported
<class 'weld.grizzly.core.series.GrizzlySeries'>
>>> y = GrizzlySeries(['hi', 'bye'])
>>> y.__class__
<class 'pandas.core.series.Series'>
<class 'weld.grizzly.core.series.GrizzlySeries'>
>>> y = GrizzlySeries([1, 2, 3], index=[1, 0, 2]) # Unsupported
>>> y.__class__
<class 'pandas.core.series.Series'>
"""
s = None
if isinstance(data, WeldLazy):
self = super(GrizzlySeries, cls).__new__(cls)
super(GrizzlySeries, self).__init__(None, dtype=dtype, **kwargs)
super(GrizzlySeries, self).__init__(np.array([], dtype=dtype), **kwargs)
self.weld_value_ = data
return self
elif index is not None and not isinstance(index, pd.RangeIndex):

if index is not None and not isinstance(index, pd.RangeIndex):
# TODO(shoumik): This is probably incomplete, since we could have a
# RangeIndex that does not capture the full span of the data, has a
# non-zero step, etc.
return pd.Series(data, dtype=dtype, index=index, **kwargs)
elif len(kwargs) != 0:

if len(kwargs) != 0:
# Unsupported arguments present: bail for now.
return pd.Series(data, dtype=dtype, index=index, **kwargs)

if isinstance(data, list) and len(data) > 0 and isinstance(data[0], str):
# Try to convert a list of strings into a supported Numpy array.
data = np.array(data, dtype='S')

if isinstance(data, pd.Series):
data = data.values
elif not isinstance(data, np.ndarray):
# First, convert the input into a Series backed by an ndarray.
s = pd.Series(data, dtype=dtype, index=index, **kwargs)
data = s.values
s = pd.Series(data, dtype=dtype, index=index, **kwargs)
data = s.values

# Try to create a Weld type for the input.
weld_type = GrizzlySeries._supports_grizzly(data)
Expand All @@ -270,9 +282,17 @@ def __new__(cls, data, dtype=None, index=None, **kwargs):
PhysicalValue(data, weld_type, GrizzlySeries._encoder),
GrizzlySeries._decoder)
return self

# Don't re-convert values if we did it once already -- it's expensive.
return s if s is not None else pd.Series(data, dtype=dtype, index=index, **kwargs)

# ---------------------- StringMethods ------------------------------

@property
def str(self):
    """
    Return a `StringMethods` accessor wrapping this series.

    A new accessor object is constructed on every access.
    """
    # TODO(shoumik.palkar): Use pandas.core.accessor.CachedAccessor?
    accessor = StringMethods(self)
    return accessor

# ---------------------- Indexing ------------------------------

def __setitem__(self, key, value):
Expand Down
Loading