Skip to content

Commit

Permalink
[WIP] Refactor pd grouped (#177)
Browse files Browse the repository at this point in the history
* refactor: clean up grouped pandas translations

* chore: refactor series spec and corresponding tests

* tests: refactor series spec tests, add ones for summarize

* feat: mock_sqlalchemy_engine for showing sql

* script for running and dumping outputs of methods

* fix utils example
  • Loading branch information
machow authored Jan 25, 2020
1 parent f0c3606 commit f825e74
Show file tree
Hide file tree
Showing 7 changed files with 531 additions and 199 deletions.
236 changes: 236 additions & 0 deletions docs/generate_impl_table.Rmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,236 @@
---
jupyter:
  jupytext:
    text_representation:
      extension: .Rmd
      format_name: rmarkdown
      format_version: '1.2'
      jupytext_version: 1.3.0
  kernelspec:
    display_name: Python 3
    language: python
    name: python3
---

```{python}
from siuba.spec.series import spec, nested_spec
from tabulate import tabulate
```

```{python}
from siuba.siu import ALL_OPS
```

```{python}
from black import format_str, FileMode
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import HTML
from pygments import highlight
from pygments.lexers import PythonLexer
from pygments.formatters import HtmlFormatter
from IPython.utils.capture import capture_output
# Source templates for the generated examples. Each one is filled in with
# str.format by load_template; the placeholders are {data}, {verb} and
# {call_expr} (a template may use only some of them).

# Example 1: build a pandas DataFrame from literal data and apply the verb.
EXAMPLE_TEMPLATE = """
import pandas as pd
from siuba import _, {verb}
data = pd.DataFrame({data})
{verb}(data, result = {call_expr})
"""
# Example 2: same expression against a LazyTbl backed by a mock postgresql
# engine, ending in show_query().
EXAMPLE_TEMPLATE2 = """
from siuba import _, show_query, {verb}
from siuba.sql import LazyTbl
from siuba.sql.utils import mock_sqlalchemy_engine
engine = mock_sqlalchemy_engine("postgresql")
tbl = LazyTbl(engine, 'some_table', ['g', 'x', 'y'])
query = tbl >> {verb}(result = {call_expr}) >> show_query()
"""
# Example 3: grouped variant of example 2. It does not define `tbl`, `_`,
# `show_query` or the verb itself, so it must run in a shell where example 2
# has already been executed.
EXAMPLE_TEMPLATE3 = """
from siuba import group_by
query = tbl >> group_by(_.g) >> {verb}(result = {call_expr}) >> show_query()
"""
def load_template(template, data, verb, call_expr):
    """Fill in an example template and return it formatted by black.

    Parameters
    ----------
    template : str
        A str.format template that may use the ``{data}``, ``{verb}`` and
        ``{call_expr}`` placeholders.
    data : str
        Source text substituted for ``{data}``.
    verb : str
        Name of the verb substituted for ``{verb}``.
    call_expr : str
        Source text of the expression substituted for ``{call_expr}``.

    Returns
    -------
    str
        The filled-in source code, reformatted with black's default FileMode.
    """
    # Use the call_expr argument; the previous version ignored it and read the
    # notebook-global `entry` instead, which only worked by accident inside
    # the table-building loop.
    loaded_str = template.format(
        data = data,
        verb = verb,
        call_expr = call_expr,
    )
    return format_str(loaded_str, mode = FileMode())
def run_to_html_payload(name, code, shell):
    """Run `code` in `shell` and package its input and output as HTML.

    Parameters
    ----------
    name : str
        Label stored under the 'name' key of the payload.
    code : str
        Python source passed to ``shell.run_cell``.
    shell :
        Object exposing ``run_cell(code).result`` (an InteractiveShell in
        this notebook).

    Returns
    -------
    dict
        Keys: 'name', 'input' (Pygments-highlighted source), 'output'
        (``to_html()`` of a DataFrame result, otherwise the captured output
        wrapped in ``<pre>``), and 'printed' (the captured output as text).
    """
    # Imported locally: this cell never imports pandas, and the function is
    # first called before the later cell that runs `import pandas as pd`,
    # so relying on a global `pd` raises NameError on a top-to-bottom run.
    import pandas as pd

    with capture_output() as c:
        res = shell.run_cell(code).result

    if isinstance(res, pd.DataFrame):
        output = res.to_html()
    else:
        output = "<pre>" + str(c) + "</pre>"

    code_html = highlight(code, PythonLexer(), HtmlFormatter(prestyles = "text-align: left;"))
    return {'name': name, 'input': code_html, 'output': output, "printed": str(c)}
def create_code(entry, data, shell):
    """Build and run the example snippets for one spec entry.

    Always produces a 'Pandas DataFrame' example. Unless the entry's
    postgresql status is "xfail" or "not_impl", also produces 'SQL Table' and
    'Grouped SQL Table' examples. Returns the list of payload dicts from
    run_to_html_payload, in that order.
    """
    df = get_data(entry, data)

    # Entries on the "dt" accessor use a hand-written literal that calls
    # pd.to_datetime; everything else uses the repr of the example data.
    if entry['accessor'] != "dt":
        df_repr = repr(df.to_dict(orient = "list"))
    else:
        df_repr = """
{'g': ['a', 'a', 'b', 'b'],
'x': pd.to_datetime(["2019-01-01 01:01:01", "2020-04-08 02:02:02","2021-07-15 03:03:03", "2022-10-22 04:04:04"])
}
"""

    verb = "mutate"
    if entry['result']['type'] == "Agg":
        verb = "summarize"
    call_expr = str(entry['expr_frame'])

    def render(label, template):
        # Fill in the template, then execute it and collect the HTML payload.
        source = load_template(template, df_repr, verb, call_expr)
        return run_to_html_payload(label, source, shell)

    examples = [render('Pandas DataFrame', EXAMPLE_TEMPLATE)]

    if entry['result'].get('postgresql') not in {"xfail", "not_impl"}:
        examples.append(render('SQL Table', EXAMPLE_TEMPLATE2))
        # The grouped example reuses `tbl` defined by the SQL example above,
        # so it has to run after it in the same shell.
        examples.append(render('Grouped SQL Table', EXAMPLE_TEMPLATE3))

    return examples
```

```{python}
from siuba.tests.test_dply_series_methods import get_data, DATA
# Emoji shown in the support table for each implementation status.
STATUS = {'done':'✅', 'xfail': '🚧', 'not_impl': '❌'}

shell = InteractiveShell()
table = []

# Build one table row per method in the series spec.
for name, entry in spec.items():
    result_spec = entry['result']

    # Free-text notes about SQL-specific behavior.
    notes = []
    sql_type = result_spec.get('sql_type')
    if sql_type:
        notes.append("SQL returns a %s."%sql_type)

    no_mutate = result_spec.get('no_mutate')
    if no_mutate:
        notes.append("Cannot be used in a mutate with %s"%",".join(no_mutate))

    # A missing postgresql status is treated as 'done'.
    postgresql = result_spec.get('postgresql', 'done')

    # Run the examples, then clear the shell so state does not leak between
    # methods. (example_data and entry_type are not used further in this cell.)
    example_data = get_data(entry, DATA)
    entry_type = entry.get('type')
    examples = create_code(entry, DATA, shell)
    shell.reset()

    row = {
        'name': name,
        'category': entry['category'],
        'data_arity': entry['data_arity'],
        'type': result_spec.get('type'),
        'pandas': STATUS['done'],
        'postgresql': STATUS[postgresql],
        'expr_frame': str(entry['expr_frame']),
        'note': "\n".join(notes),
        'examples': examples,
    }
    table.append(row)
```

```{python}
from airtable import Airtable
import pandas as pd
from siuba import filter, _, pipe
# Fetch every record of the 'methods' table from the Airtable base.
airtable = Airtable('appErTNqCFXn6stSH', 'methods')
res = airtable.get_all()

# pandas 1.0 moved json_normalize to the top-level namespace and deprecated
# pd.io.json.json_normalize; prefer the new location and fall back for older
# pandas versions.
_json_normalize = getattr(pd, 'json_normalize', None) or pd.io.json.json_normalize
air_methods = _json_normalize(res)

# Flattened columns look like 'a.b.c'; keep only the last path component.
air_methods.columns = air_methods.columns.map(lambda s: s.split('.')[-1])
air_methods.rename(columns = {'method_name': 'name'}, inplace = True)
```

```{python}
# Emoji shown in the 'fast grouped' column for each Airtable support category.
AIR_STATUS = {
    'done':'✅',
    'priority-zero': '',
    'priority-low': '',
    'priority-medium': '🚧',
    'priority-high': '🚧',
    'no support': '❌'
}

# Categories to leave out of the table (currently none).
hidden_cats = {}#{'_special_methods', 'binary'}
visible_rows = [row for row in table if row['category'] not in hidden_cats]
final_table = pd.DataFrame(visible_rows)

# Support categories with no entry in AIR_STATUS are shown as not supported.
small_air = air_methods.loc[:, ['category', 'support_category', 'name']]
small_air['fast grouped'] = small_air.support_category.map(AIR_STATUS).fillna('❌')

# Left join: keep every Airtable method, using Airtable's category column
# rather than the one from the spec.
spec_columns = final_table.drop(columns = ['category'])
merged = small_air.merge(spec_columns, how = "left", on = "name")
```

```{python}
from IPython.display import HTML
from qgrid import show_grid
# TODO:
# * add pandas methods that are missing (currently only those implemented
#   for group by are included)
# * make the table filterable on backend and status
# * include method doc?
# * replace "type" with "result" category
#HTML(tabulate(final_table, headers = "keys", tablefmt = "html"))
cols_to_keep = [
    "category",
    "name",
    "fast grouped",
    "postgresql",
    "note",
    "expr_frame",
    "support_category",
    "examples",
]

# Blank out missing values, order the rows, then drop rows whose
# 'fast grouped' status is the empty string.
selected = merged[cols_to_keep].fillna("")
ordered = selected.sort_values(["category", "name"])
final = ordered[ordered["fast grouped"] != ""]

HTML(tabulate(final, headers = "keys", tablefmt = "html"))
final.to_json('../docs/_static/support-table/data.json', orient = 'records')
```

## Create example method docs

```{python}
# Print the CSS rules Pygments generates for the `.highlight` selector
# (presumably to copy into the docs stylesheet for the highlighted examples).
print(HtmlFormatter().get_style_defs('.highlight'))
```
6 changes: 6 additions & 0 deletions siuba/experimental/pd_groups/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,12 @@ def grouper_match(grp1, grp2):


def broadcast_group_elements(x, y):
"""Returns 3-tuple of same-length x and y data, plus a reference group by object.
Note:
* Raises error if x and y are not compatible group by objects.
* Will broadcast a GroupByAgg, to ensure same length as other data.
"""
if all_isinstance(GroupByAgg, x, y) and x._orig_grouper is y._orig_grouper:
return x.obj, y.obj, x

Expand Down
100 changes: 56 additions & 44 deletions siuba/experimental/pd_groups/translate.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,82 +2,94 @@
import pandas as pd


def is_literal(el):
    """Return True when `el` is an int, float or str instance."""
    # TODO: pandas has this function--should use that
    literal_types = (int, float, str)
    return isinstance(el, literal_types)
# utilities -------------------------------------------------------------------

def _validate_data_args(x, **kwargs):
    """Check the data arguments passed to a grouped translation.

    Raises TypeError when `x` is not a SeriesGroupBy, or when any keyword
    argument is a plain pandas Series (the message names the first such
    argument). Returns None otherwise.
    """
    is_grouped = isinstance(x, SeriesGroupBy)
    if not is_grouped:
        raise TypeError("First data argument must be a grouped Series object")

    # Keyword names are strings, so None is a safe "nothing found" marker.
    series_names = (key for key, value in kwargs.items() if isinstance(value, pd.Series))
    offender = next(series_names, None)
    if offender is not None:
        raise TypeError("{} may not be a Series.".format(offender))


def _apply_grouped_method(ser, name, is_property, accessor, args, kwargs):
    """Look up attribute `name` on `ser` and call it unless it is a property.

    When `accessor` is truthy the attribute is looked up on
    ``getattr(ser, accessor)`` instead of on `ser` itself. For a property the
    attribute value is returned as-is; otherwise it is called with `args`
    and `kwargs` and the call's result is returned.
    """
    target = getattr(ser, accessor) if accessor else ser
    attr = getattr(target, name)

    if is_property:
        return attr

    return attr(*args, **kwargs)


def _maybe_broadcast(x, y):
    """Same as broadcast_group_elements, but y doesn't have to be SeriesGroupBy

    When y is not a SeriesGroupBy (e.g. the literal 1) it is passed through
    untouched rather than raising or being broadcast to the length of x, so
    the pandas Series operation can handle it directly.

    Returns a 3-tuple of (left data, right data, reference group by object).
    """
    if not isinstance(y, SeriesGroupBy):
        # Keep y as-is and use x as the reference grouping.
        return x.obj, y, x

    left, right, groupby = broadcast_group_elements(x, y)
    return left, right, groupby


# Translations ----------------------------------------------------------------

def not_implemented(name, is_property, accessor):
    """Return the NotImplementedError class, ignoring all arguments.

    Note that the exception class itself is returned, not raised and not
    wrapped in a function.
    """
    marker = NotImplementedError
    return marker


def method_agg_op(name, is_property, accessor):
    """Create a translation for an aggregating method on a grouped Series.

    Parameters
    ----------
    name : str
        Attribute to look up on the grouped Series; also used as the
        returned function's __name__ and __qualname__.
    is_property : bool
        When true the attribute is used as-is instead of being called.
    accessor : str or None
        Optional accessor (looked up on the grouped Series first) that
        holds the attribute.

    Returns
    -------
    function
        ``f(__ser, *args, **kwargs)``, which raises TypeError unless `__ser`
        is a SeriesGroupBy, applies the method to the grouped Series and
        wraps the result with ``GroupByAgg.from_result``.
    """
    def f(__ser, *args, **kwargs):
        _validate_data_args(__ser)
        # Single lookup and call through the shared helper, so accessor and
        # is_property are honored (the stale direct getattr/call that used to
        # overwrite this result has been removed).
        res = _apply_grouped_method(__ser, name, is_property, accessor, args, kwargs)

        return GroupByAgg.from_result(res, __ser)

    f.__name__ = f.__qualname__ = name
    return f


def method_el_op(name, is_property, accessor):
    """Create a translation for an elementwise method on a grouped Series.

    Parameters
    ----------
    name : str
        Attribute to look up on the underlying Series (``__ser.obj``); also
        used as the returned function's __name__ and __qualname__.
    is_property : bool
        When true the attribute is used as-is instead of being called.
    accessor : str or None
        Optional accessor (e.g. looked up as ``__ser.obj.<accessor>``) that
        holds the attribute.

    Returns
    -------
    function
        ``f(__ser, *args, **kwargs)``, which raises TypeError unless `__ser`
        is a SeriesGroupBy, applies the method to the ungrouped data and
        passes the result with `__ser` to ``_regroup``.
    """
    def f(__ser, *args, **kwargs):
        _validate_data_args(__ser)
        # Elementwise operations run on the ungrouped Series; the method is
        # looked up and called exactly once through the shared helper.
        res = _apply_grouped_method(__ser.obj, name, is_property, accessor, args, kwargs)

        return _regroup(res, __ser)

    f.__name__ = f.__qualname__ = name
    return f

def method_el_op2(name, is_property, accessor):
    """Create a translation for a binary elementwise method on grouped Series.

    Parameters
    ----------
    name : str
        Name of the Series method applied as ``left.<name>(right)``; also
        used as the returned function's __name__ and __qualname__.
    is_property, accessor :
        Accepted so the signature matches the other translation factories;
        not used by this one.

    Returns
    -------
    function
        ``f(x, y)``. `x` must be a SeriesGroupBy and `y` may be a
        SeriesGroupBy or any non-Series value (TypeError otherwise). The
        result of the operation is passed with the reference group by
        object to ``_regroup``.
    """
    def f(x, y):
        _validate_data_args(x, y = y)
        # A grouped y is broadcast against x; anything else (e.g. a literal)
        # is handed to the pandas operation unchanged.
        left, right, groupby = _maybe_broadcast(x, y)

        op_function = getattr(left, name)

        res = op_function(right)
        return _regroup(res, groupby)

    f.__name__ = f.__qualname__ = name
    return f


def method_win_op(name, is_property, accessor):
    """Create a translation for a window method on a grouped Series.

    Parameters
    ----------
    name : str
        Attribute to look up on the grouped Series; also used as the
        returned function's __name__ and __qualname__.
    is_property : bool
        When true the attribute is used as-is instead of being called.
    accessor : str or None
        Optional accessor (looked up on the grouped Series first) that
        holds the attribute.

    Returns
    -------
    function
        ``f(__ser, *args, **kwargs)``, which raises TypeError unless `__ser`
        is a SeriesGroupBy, applies the method to the grouped Series itself
        and passes the result with `__ser` to ``_regroup``.
    """
    def f(__ser, *args, **kwargs):
        _validate_data_args(__ser)
        # Unlike elementwise ops, the method is applied to the grouped object
        # (not __ser.obj), and only once, through the shared helper.
        res = _apply_grouped_method(__ser, name, is_property, accessor, args, kwargs)

        return _regroup(res, __ser)

    f.__name__ = f.__qualname__ = name
    return f


Expand Down
Loading

0 comments on commit f825e74

Please sign in to comment.