diff --git a/docs/generate_impl_table.Rmd b/docs/generate_impl_table.Rmd new file mode 100644 index 00000000..38718acd --- /dev/null +++ b/docs/generate_impl_table.Rmd @@ -0,0 +1,236 @@ +--- +jupyter: + jupytext: + text_representation: + extension: .Rmd + format_name: rmarkdown + format_version: '1.2' + jupytext_version: 1.3.0 + kernelspec: + display_name: Python 3 + language: python + name: python3 +--- + +```{python} +from siuba.spec.series import spec, nested_spec +from tabulate import tabulate +``` + +```{python} +from siuba.siu import ALL_OPS +``` + +```{python} +from black import format_str, FileMode +from IPython.core.interactiveshell import InteractiveShell +from IPython.display import HTML + +from pygments import highlight +from pygments.lexers import PythonLexer +from pygments.formatters import HtmlFormatter + +from IPython.utils.capture import capture_output + +EXAMPLE_TEMPLATE = """ +import pandas as pd +from siuba import _, {verb} + +data = pd.DataFrame({data}) + +{verb}(data, result = {call_expr}) +""" + +EXAMPLE_TEMPLATE2 = """ +from siuba import _, show_query, {verb} +from siuba.sql import LazyTbl +from siuba.sql.utils import mock_sqlalchemy_engine + +engine = mock_sqlalchemy_engine("postgresql") +tbl = LazyTbl(engine, 'some_table', ['g', 'x', 'y']) + +query = tbl >> {verb}(result = {call_expr}) >> show_query() +""" + +EXAMPLE_TEMPLATE3 = """ +from siuba import group_by +query = tbl >> group_by(_.g) >> {verb}(result = {call_expr}) >> show_query() +""" + +def load_template(template, data, verb, call_expr): + loaded_str = template.format( + data = data, + verb = verb, + call_expr = str(entry['expr_frame']) + ) + + mode = FileMode() + pretty_code = format_str(loaded_str, mode = mode) + + return pretty_code + +def run_to_html_payload(name, code, shell): + with capture_output() as c: + res = shell.run_cell(code).result + + if isinstance(res, pd.DataFrame): + output = res.to_html() + else: + output = "
" + str(c) + "
" + + code_html = highlight(code, PythonLexer(), HtmlFormatter(prestyles = "text-align: left;")) + return {'name': name, 'input': code_html, 'output': output, "printed": str(c)} + +def create_code(entry, data, shell): + df = get_data(entry, data) + + + if entry['accessor'] == "dt": + df_repr = """ +{'g': ['a', 'a', 'b', 'b'], + 'x': pd.to_datetime(["2019-01-01 01:01:01", "2020-04-08 02:02:02","2021-07-15 03:03:03", "2022-10-22 04:04:04"]) + } +""" + else: + df_repr = repr(df.to_dict(orient = "list")) + + verb = "summarize" if entry['result']['type'] == "Agg" else "mutate" + + call_expr = str(entry['expr_frame']) + examples = [] + + pretty_code = load_template(EXAMPLE_TEMPLATE, df_repr, verb, call_expr) + examples.append( + run_to_html_payload('Pandas DataFrame', pretty_code, shell) + ) + + if entry['result'].get('postgresql') not in {"xfail", "not_impl"}: + pretty_code2 = load_template(EXAMPLE_TEMPLATE2, df_repr, verb, call_expr) + examples.append( + run_to_html_payload('SQL Table', pretty_code2, shell) + ) + + pretty_code3 = load_template(EXAMPLE_TEMPLATE3, df_repr, verb, call_expr) + examples.append( + run_to_html_payload('Grouped SQL Table', pretty_code3, shell) + ) + + + return examples + + + + +``` + +```{python} +from siuba.tests.test_dply_series_methods import get_data, DATA + +STATUS = {'done':'✅', 'xfail': '🚧', 'not_impl': '❌'} + +shell = InteractiveShell() +table = [] +for name, entry in spec.items(): + + # notes + notes = [] + sql_type = entry['result'].get('sql_type') + if sql_type: + notes.append("SQL returns a %s."%sql_type) + + # postgres stuff + no_mutate = entry['result'].get('no_mutate') + if no_mutate: + notes.append("Cannot be used in a mutate with %s"%",".join(no_mutate)) + postgresql = entry['result'].get('postgresql', 'done') + + # example + example_data = get_data(entry, DATA) + entry_type = entry.get('type') + + examples = create_code(entry, DATA, shell) + shell.reset() + + + table.append({ + 'name': name, + 'category': entry['category'], + 
'data_arity': entry['data_arity'], + 'type': entry['result'].get('type'), + 'pandas': STATUS['done'], + 'postgresql': STATUS[postgresql], + 'expr_frame': str(entry['expr_frame']), + 'note': "\n".join(notes), + 'examples': examples + }) + +``` + +```{python} +from airtable import Airtable +import pandas as pd + +from siuba import filter, _, pipe + +airtable = Airtable('appErTNqCFXn6stSH', 'methods') + +res = airtable.get_all() + +air_methods = pd.io.json.json_normalize(res) +air_methods.columns = air_methods.columns.map(lambda s: s.split('.')[-1]) +air_methods.rename(columns = {'method_name': 'name'}, inplace = True) + +``` + +```{python} +AIR_STATUS = { + 'done':'✅', + 'priority-zero': '', + 'priority-low': '', + 'priority-medium': '🚧', + 'priority-high': '🚧', + 'no support': '❌' +} + +hidden_cats = {}#{'_special_methods', 'binary'} + +final_table = pd.DataFrame([x for x in table if x['category'] not in hidden_cats]) + +small_air = air_methods.loc[:, ['category', 'support_category', 'name']] +small_air['fast grouped'] = small_air.support_category.map(AIR_STATUS).fillna('❌') + +merged = small_air.merge(final_table.drop(columns = ['category']), how = "left", on = "name") +``` + +```{python} +from IPython.display import HTML +from qgrid import show_grid + +# TODO: +# * missing pandas methods (crnt only includes those impl for group by) +# * filterable on backend and status +# * include method doc? 
+# * replace "type" with "result" category +#HTML(tabulate(final_table, headers = "keys", tablefmt = "html")) +cols_to_keep = [ + "category", "name", + "fast grouped", "postgresql", + "note", "expr_frame", "support_category", + "examples" +] +final = (merged[cols_to_keep] + .fillna("") + .sort_values(["category", "name"]) + [lambda d: d["fast grouped"] != ""] + ) +HTML(tabulate(final, headers = "keys", tablefmt = "html")) + +final.to_json('../docs/_static/support-table/data.json', orient = 'records') + +``` + +## Create example method docs + +```{python} +print(HtmlFormatter().get_style_defs('.highlight')) +``` diff --git a/siuba/experimental/pd_groups/groupby.py b/siuba/experimental/pd_groups/groupby.py index 8c9c7261..1c76c3df 100644 --- a/siuba/experimental/pd_groups/groupby.py +++ b/siuba/experimental/pd_groups/groupby.py @@ -87,6 +87,12 @@ def grouper_match(grp1, grp2): def broadcast_group_elements(x, y): + """Returns 3-tuple of same-length x and y data, plus a reference group by object. + + Note: + * Raises error if x and y are not compatible group by objects. + * Will broadcast a GroupByAgg, to ensure same length as other data. 
+ """ if all_isinstance(GroupByAgg, x, y) and x._orig_grouper is y._orig_grouper: return x.obj, y.obj, x diff --git a/siuba/experimental/pd_groups/translate.py b/siuba/experimental/pd_groups/translate.py index 1b49353f..8eec0044 100644 --- a/siuba/experimental/pd_groups/translate.py +++ b/siuba/experimental/pd_groups/translate.py @@ -2,82 +2,94 @@ import pandas as pd -def is_literal(el): - # TODO: pandas has this function--should use that - return isinstance(el, (int, float, str)) +# utilities ------------------------------------------------------------------- + +def _validate_data_args(x, **kwargs): + if not isinstance(x, SeriesGroupBy): + raise TypeError("First data argument must be a grouped Series object") + + for name, other_data in kwargs.items(): + if isinstance(other_data, pd.Series): + raise TypeError("{} may not be a Series.".format(name)) + + +def _apply_grouped_method(ser, name, is_property, accessor, args, kwargs): + if accessor: + method = getattr(getattr(ser, accessor), name) + else: + method = getattr(ser, name) + + res = method(*args, **kwargs) if not is_property else method + + return res + + +def _maybe_broadcast(x, y): + """Same as broadcast_group_elements, but y doesn't have to be SeriesGroupBy + + This is important when y is a literal (e.g. 1), since we don't want to raise + an error, or broadcast 1 to the length of x. Rather, we want to keep the literal, + and let the pandas series handle it in the operation. 
+ + """ + if isinstance(y, SeriesGroupBy): + left, right, groupby = broadcast_group_elements(x, y) + else: + left, right, groupby = x.obj, y, x + + return left, right, groupby + + +# Translations ---------------------------------------------------------------- def not_implemented(name, is_property, accessor): return NotImplementedError + def method_agg_op(name, is_property, accessor): def f(__ser, *args, **kwargs): - if not isinstance(__ser, SeriesGroupBy): - raise TypeError("All methods must operate on grouped Series objects") - - method = getattr(__ser, name) + _validate_data_args(__ser) + res = _apply_grouped_method(__ser, name, is_property, accessor, args, kwargs) - res = method(*args, **kwargs) return GroupByAgg.from_result(res, __ser) - f.__name__ = name - f.__qualname__ = name + f.__name__ = f.__qualname__ = name return f + def method_el_op(name, is_property, accessor): def f(__ser, *args, **kwargs): - if not isinstance(__ser, SeriesGroupBy): - raise TypeError("All methods must operate on a grouped Series objects") - - if accessor: - method = getattr(getattr(__ser.obj, accessor), name) - else: - method = getattr(__ser.obj, name) + _validate_data_args(__ser) + res = _apply_grouped_method(__ser.obj, name, is_property, accessor, args, kwargs) - res = method(*args, **kwargs) if not is_property else method return _regroup(res, __ser) - f.__name__ = name - f.__qualname__ = name + f.__name__ = f.__qualname__ = name return f -def method_el_op2(name, **kwargs): + +def method_el_op2(name, is_property, accessor): def f(x, y): - if isinstance(x, pd.Series) or isinstance(y, pd.Series): - raise TypeError("No Series allowed") - - elif isinstance(x, SeriesGroupBy) and isinstance(y, SeriesGroupBy): - left, right, groupby = broadcast_group_elements(x, y) - elif is_literal(x): - right, left, groupby = x, y.obj, y - elif is_literal(y): - left, right, groupby = x.obj, y, x - else: - raise TypeError("All methods must operate on a grouped Series objects") + _validate_data_args(x, 
y = y) + left, right, groupby = _maybe_broadcast(x, y) op_function = getattr(left, name) res = op_function(right) return _regroup(res, groupby) - f.__name__ = name - f.__qualname__ = name + f.__name__ = f.__qualname__ = name return f + def method_win_op(name, is_property, accessor): def f(__ser, *args, **kwargs): - if not isinstance(__ser, SeriesGroupBy): - raise TypeError("All methods must operate on a grouped Series objects") - - if accessor: - method = getattr(getattr(__ser, accessor), name) - else: - method = getattr(__ser, name) + _validate_data_args(__ser) + res = _apply_grouped_method(__ser, name, is_property, accessor, args, kwargs) - res = method(*args, **kwargs) if not is_property else method return _regroup(res, __ser) - f.__name__ = name - f.__qualname__ = name + f.__name__ = f.__qualname__ = name return f diff --git a/siuba/spec/series.py b/siuba/spec/series.py index fae663b0..880479ad 100644 --- a/siuba/spec/series.py +++ b/siuba/spec/series.py @@ -1,5 +1,12 @@ from siuba.siu import Symbolic, strip_symbolic # TODO: dot, corr, cov +# ordered set aggregate. e.g. mode() +# hypothetical-set aggregate (e.g. rank(a) as if it were in partition(order_by b)) + +# kinds of windows: +# * result len n_elements: rank() +# * result len 1: is_monotonic (lag, diff, and any). ordered set aggregate. +# * result len input len: percentile_cont([.1, .2]). hypo set aggregate. 
_ = Symbolic() @@ -16,7 +23,6 @@ class Window(Result): pass class Singleton(Result): pass class WontImplement(Result): pass - CATEGORIES_TIME = { 'time_series', 'datetime_properties', 'datetime_methods', 'period_properties', 'timedelta_properties', 'timedelta_methods' @@ -31,8 +37,8 @@ class WontImplement(Result): pass # * doesn't work in mutate (no_mutate = ['postgresql']) # * returns float rather than int (sql_type = 'float') # * requires boolean or float input (op = 'bool') -# * won't be implemented (in some backend) (not_impl = ['postgresql']) -# * isn't implemented now, but will be (xfail = ['postgresql']) +# * won't be implemented (in some backend) (postgresql = 'not_impl') +# * isn't implemented now, but will be (postgresql = 'xfail') funcs = { ## ------------------------------------------------------------------------ # Attributes @@ -41,36 +47,36 @@ class WontImplement(Result): pass '__invert__': _.__invert__() >> Elwise(op = 'bool'), '__and__': _.__and__(_) >> Elwise(op = 'bool'), '__or__': _.__or__(_) >> Elwise(op = 'bool'), - '__xor__': _.__xor__(_) >> Elwise(op = 'bool', xfail = ['postgresql']), + '__xor__': _.__xor__(_) >> Elwise(op = 'bool', postgresql = 'xfail'), '__neg__': _.__neg__() >> Elwise(), - '__pos__': _.__pos__() >> Elwise(xfail = ['postgresql']), + '__pos__': _.__pos__() >> Elwise(postgresql = 'xfail'), '__rand__': _.__rand__(_) >> Elwise(op = 'bool'), '__ror__': _.__ror__(_) >> Elwise(op = 'bool'), - '__rxor__': _.__rxor__(_) >> Elwise(op = 'bool', xfail = ['postgresql']), + '__rxor__': _.__rxor__(_) >> Elwise(op = 'bool', postgresql = 'xfail'), # copied from binary section below '__add__': _.__add__(_) >> Elwise(), '__sub__': _.__sub__(_) >> Elwise(), - '__truediv__': _.__truediv__(_) >> Elwise(xfail = ['postgresql']), # TODO: pg needs cast int to float? - '__floordiv__': _.__floordiv__(_) >> Elwise(xfail = ['postgresql']), + '__truediv__': _.__truediv__(_) >> Elwise(postgresql = 'xfail'), # TODO: pg needs cast int to float? 
+ '__floordiv__': _.__floordiv__(_) >> Elwise(postgresql = 'xfail'), '__mul__': _.__mul__(_) >> Elwise(), '__mod__': _.__mod__(_) >> Elwise(), - '__pow__': _.__pow__(_) >> Elwise(xfail = ['postgresql']), + '__pow__': _.__pow__(_) >> Elwise(postgresql = 'xfail'), '__lt__': _.__lt__(_) >> Elwise(), '__gt__': _.__gt__(_) >> Elwise(), '__le__': _.__le__(_) >> Elwise(), '__ge__': _.__ge__(_) >> Elwise(), '__ne__': _.__ne__(_) >> Elwise(), '__eq__': _.__eq__(_) >> Elwise(), - '__div__': _.__div__(_) >> Elwise(xfail = ['postgresql']), # TODO: deprecated in python3, not in siu - '__round__': _.__round__(2) >> Elwise(xfail = ['postgresql']), # TODO: pg returns float + '__div__': _.__div__(_) >> Elwise(postgresql = 'xfail'), # TODO: deprecated in python3, not in siu + '__round__': _.__round__(2) >> Elwise(postgresql = 'xfail'), # TODO: pg returns float '__radd__': _.__radd__(_) >> Elwise(), '__rsub__': _.__rsub__(_) >> Elwise(), '__rmul__': _.__rmul__(_) >> Elwise(), - '__rdiv__': _.__rdiv__(_) >> Elwise(xfail = ['postgresql']), - '__rtruediv__': _.__rtruediv__(_) >> Elwise(xfail = ['postgresql']), - '__rfloordiv__': _.__rfloordiv__(_) >> Elwise(xfail = ['postgresql']), + '__rdiv__': _.__rdiv__(_) >> Elwise(postgresql = 'xfail'), + '__rtruediv__': _.__rtruediv__(_) >> Elwise(postgresql = 'xfail'), + '__rfloordiv__': _.__rfloordiv__(_) >> Elwise(postgresql = 'xfail'), '__rmod__': _.__rmod__(_) >> Elwise(), - '__rpow__': _.__rpow__(_) >> Elwise(xfail = ['postgresql']), + '__rpow__': _.__rpow__(_) >> Elwise(postgresql = 'xfail'), }, 'attributes': { # method @@ -104,7 +110,7 @@ class WontImplement(Result): pass 'conversion': { 'astype': _.astype('str') >> Elwise(), # infer_objects - 'copy': _.copy() >> Elwise(not_impl = ['postgresql']), + 'copy': _.copy() >> Elwise(postgresql = 'not_impl'), # bool # to_numpy # to_period @@ -136,27 +142,27 @@ class WontImplement(Result): pass 'binary': { 'add': _.add(_) >> Elwise(), 'sub': _.sub(_) >> Elwise(), - 'truediv': _.truediv(_) >> 
Elwise(xfail = ['postgresql']), - 'floordiv': _.floordiv(_) >> Elwise(xfail = ['postgresql']), + 'truediv': _.truediv(_) >> Elwise(postgresql = 'xfail'), + 'floordiv': _.floordiv(_) >> Elwise(postgresql = 'xfail'), 'mul': _.mul(_) >> Elwise(), 'mod': _.mod(_) >> Elwise(), - 'pow': _.pow(_) >> Elwise(xfail = ['postgresql']), + 'pow': _.pow(_) >> Elwise(postgresql = 'xfail'), 'lt': _.lt(_) >> Elwise(), 'gt': _.gt(_) >> Elwise(), 'le': _.le(_) >> Elwise(), 'ge': _.ge(_) >> Elwise(), 'ne': _.ne(_) >> Elwise(), 'eq': _.eq(_) >> Elwise(), - 'div': _.div(_) >> Elwise(xfail = ['postgresql']), - 'round': _.round(2) >> Elwise(xfail = ['postgresql']), + 'div': _.div(_) >> Elwise(postgresql = 'xfail'), + 'round': _.round(2) >> Elwise(postgresql = 'xfail'), 'radd': _.radd(_) >> Elwise(), 'rsub': _.rsub(_) >> Elwise(), 'rmul': _.rmul(_) >> Elwise(), - 'rdiv': _.rdiv(_) >> Elwise(xfail = ['postgresql']), - 'rtruediv': _.rtruediv(_) >> Elwise(xfail = ['postgresql']), - 'rfloordiv': _.rfloordiv(_) >> Elwise(xfail = ['postgresql']), + 'rdiv': _.rdiv(_) >> Elwise(postgresql = 'xfail'), + 'rtruediv': _.rtruediv(_) >> Elwise(postgresql = 'xfail'), + 'rfloordiv': _.rfloordiv(_) >> Elwise(postgresql = 'xfail'), 'rmod': _.rmod(_) >> Elwise(), - 'rpow': _.rpow(_) >> Elwise(xfail = ['postgresql']), + 'rpow': _.rpow(_) >> Elwise(postgresql = 'xfail'), # combine # combine_first #'product': _.product() >> Agg(), # TODO: doesn't exist on GroupedDataFrame @@ -192,30 +198,30 @@ class WontImplement(Result): pass #'corr': _.corr(_) >> Agg(), 'count': _.count() >> Agg(), #'cov': _.cov(_) >> Agg(), - 'cummax': _.cummax() >> Window(xfail = ['postgresql']), - 'cummin': _.cummin() >> Window(xfail = ['postgresql']), - 'cumprod': _.cumprod() >> Window(xfail = ['postgresql']), - 'cumsum': _.cumsum() >> Window(xfail = ['postgresql']), + 'cummax': _.cummax() >> Window(postgresql = 'xfail'), + 'cummin': _.cummin() >> Window(postgresql = 'xfail'), + 'cumprod': _.cumprod() >> Window(postgresql = 'xfail'), + 
'cumsum': _.cumsum() >> Window(postgresql = 'xfail'), # describe 'diff': _.diff() >> Window(), # factorize # 'kurt': _.kurt() >> Agg(), # TODO: doesn't exist on GDF - 'mad': _.mad() >> Agg(xfail = ['postgresql']), + 'mad': _.mad() >> Agg(postgresql = 'xfail'), 'max': _.max() >> Agg(), 'mean': _.mean() >> Agg(), - 'median': _.median() >> Agg(xfail = ['postgresql']), + 'median': _.median() >> Agg(postgresql = 'xfail'), 'min': _.min() >> Agg(), #'mode': _.mode() >> Agg(), # TODO: doesn't exist on GDF, can return > 1 result #'nlargest': _.nlargest() >> Window(), #'nsmallest': _.nsmallest() >> Window(), - 'pct_change': _.pct_change() >> Window(xfail = ['postgresql']), - 'prod': _.prod() >> Agg(xfail = ['postgresql']), + 'pct_change': _.pct_change() >> Window(postgresql = 'xfail'), + 'prod': _.prod() >> Agg(postgresql = 'xfail'), 'quantile': _.quantile(.75) >> Agg(no_mutate = ['postgresql']), - 'rank': _.rank() >> Window(xfail = ['postgresql']), - 'sem': _.sem() >> Agg(xfail = ['postgresql']), - 'skew': _.skew() >> Agg(xfail = ['postgresql']), + 'rank': _.rank() >> Window(postgresql = 'xfail'), + 'sem': _.sem() >> Agg(postgresql = 'xfail'), + 'skew': _.skew() >> Agg(postgresql = 'xfail'), 'std': _.std() >> Agg(), - 'sum': _.sum() >> Agg(xfail = ['postgresql']), # TODO: pg returns float + 'sum': _.sum() >> Agg(postgresql = 'xfail'), # TODO: pg returns float 'var': _.var() >> Agg(), #'kurtosis': _.kurtosis() >> Agg(), # TODO: doesn't exist on GDF # unique @@ -317,17 +323,17 @@ class WontImplement(Result): pass # Datetime properties ## ------------------------------------------------------------------------ 'datetime_properties': { - 'dt.date': _.dt.date >> Elwise(not_impl = ['postgresql']), # TODO: all 3, not pandas objects - 'dt.time': _.dt.time >> Elwise(not_impl = ['postgresql']), - 'dt.timetz': _.dt.timetz >> Elwise(not_impl = ['postgresql']), + 'dt.date': _.dt.date >> Elwise(postgresql = 'not_impl'), # TODO: all 3, not pandas objects + 'dt.time': _.dt.time >> 
Elwise(postgresql = 'not_impl'), + 'dt.timetz': _.dt.timetz >> Elwise(postgresql = 'not_impl'), 'dt.year': _.dt.year >> Elwise(sql_type = 'float'), 'dt.month': _.dt.month >> Elwise(sql_type = 'float'), 'dt.day': _.dt.day >> Elwise(sql_type = 'float'), 'dt.hour': _.dt.hour >> Elwise(sql_type = 'float'), 'dt.minute': _.dt.minute >> Elwise(sql_type = 'float'), 'dt.second': _.dt.second >> Elwise(sql_type = 'float'), - 'dt.microsecond': _.dt.microsecond >> Elwise(sql_type = 'float', xfail = ['postgresql']), - 'dt.nanosecond': _.dt.nanosecond >> Elwise(not_impl = ['postgresql']), + 'dt.microsecond': _.dt.microsecond >> Elwise(sql_type = 'float', postgresql = 'xfail'), + 'dt.nanosecond': _.dt.nanosecond >> Elwise(postgresql = 'not_impl'), 'dt.week': _.dt.week >> Elwise(sql_type = 'float'), 'dt.weekofyear': _.dt.weekofyear >> Elwise(sql_type = 'float'), 'dt.dayofweek': _.dt.dayofweek >> Elwise(sql_type = 'float'), @@ -337,10 +343,10 @@ class WontImplement(Result): pass 'dt.is_month_start': _.dt.is_month_start >> Elwise(), 'dt.is_month_end': _.dt.is_month_end >> Elwise(), 'dt.is_quarter_start': _.dt.is_quarter_start >> Elwise(), - 'dt.is_quarter_end': _.dt.is_quarter_end >> Elwise(xfail = ['postgresql']), + 'dt.is_quarter_end': _.dt.is_quarter_end >> Elwise(postgresql = 'xfail'), 'dt.is_year_start': _.dt.is_year_start >> Elwise(), 'dt.is_year_end': _.dt.is_year_end >> Elwise(), - 'dt.is_leap_year': _.dt.is_leap_year >> Elwise(not_impl = ['postgresql']), + 'dt.is_leap_year': _.dt.is_leap_year >> Elwise(postgresql = 'not_impl'), 'dt.daysinmonth': _.dt.daysinmonth >> Elwise(sql_type = 'float'), 'dt.days_in_month': _.dt.days_in_month >> Elwise(sql_type = 'float'), 'dt.tz': _.dt.tz >> Singleton(), @@ -350,17 +356,17 @@ class WontImplement(Result): pass # Datetime methods ## ------------------------------------------------------------------------ 'datetime_methods': { - 'dt.to_period': _.dt.to_period('D') >> Elwise(xfail = ["postgresql"]), + 'dt.to_period': _.dt.to_period('D') >> 
Elwise(postgresql = 'xfail'), # dt.to_pydatetime # TODO: datetime objects converted back to numpy? - 'dt.tz_localize': _.dt.tz_localize('UTC') >> Elwise(xfail = ["postgresql"]), + 'dt.tz_localize': _.dt.tz_localize('UTC') >> Elwise(postgresql = 'xfail'), # dt.tz_convert # TODO: need custom test - 'dt.normalize': _.dt.normalize() >> Elwise(xfail = ["postgresql"]), - 'dt.strftime': _.dt.strftime('%d') >> Elwise(xfail = ["postgresql"]), - 'dt.round': _.dt.round('D') >> Elwise(xfail = ["postgresql"]), - 'dt.floor': _.dt.floor('D') >> Elwise(xfail = ["postgresql"]), - 'dt.ceil': _.dt.ceil('D') >> Elwise(xfail = ["postgresql"]), - 'dt.month_name': _.dt.month_name() >> Elwise(xfail = ["postgresql"]), - 'dt.day_name': _.dt.day_name() >> Elwise(xfail = ["postgresql"]), + 'dt.normalize': _.dt.normalize() >> Elwise(postgresql = 'xfail'), + 'dt.strftime': _.dt.strftime('%d') >> Elwise(postgresql = 'xfail'), + 'dt.round': _.dt.round('D') >> Elwise(postgresql = 'xfail'), + 'dt.floor': _.dt.floor('D') >> Elwise(postgresql = 'xfail'), + 'dt.ceil': _.dt.ceil('D') >> Elwise(postgresql = 'xfail'), + 'dt.month_name': _.dt.month_name() >> Elwise(postgresql = 'xfail'), + 'dt.day_name': _.dt.day_name() >> Elwise(postgresql = 'xfail'), }, ## ------------------------------------------------------------------------ # Period properties @@ -394,55 +400,55 @@ class WontImplement(Result): pass 'str.capitalize': _.str.capitalize() >> Elwise(), #'str.casefold': _.str.casefold() >> Elwise(), #TODO: introduced in v0.25.1 # str.cat #TODO: can be Agg OR Elwise, others arg - 'str.center': _.str.center(3) >> Elwise(not_impl = ['postgresql']), + 'str.center': _.str.center(3) >> Elwise(postgresql = 'not_impl'), 'str.contains': _.str.contains('a') >> Elwise(), - 'str.count': _.str.count('a') >> Elwise(xfail = ['postgresql']), + 'str.count': _.str.count('a') >> Elwise(postgresql = 'xfail'), # str.decode # TODO custom testing - 'str.encode': _.str.encode('utf-8') >> Elwise(xfail = ['postgresql']), - 
'str.endswith': _.str.endswith('a|b') >> Elwise(xfail = ['postgresql']), + 'str.encode': _.str.encode('utf-8') >> Elwise(postgresql = 'xfail'), + 'str.endswith': _.str.endswith('a|b') >> Elwise(postgresql = 'xfail'), #'str.extract': _.str.extract('(a)(b)') # TODO: returns DataFrame # str.extractall - 'str.find': _.str.find('a|c') >> Elwise(xfail = ['postgresql']), - 'str.findall': _.str.findall('a|c') >> Elwise(xfail = ['postgresql']), + 'str.find': _.str.find('a|c') >> Elwise(postgresql = 'xfail'), + 'str.findall': _.str.findall('a|c') >> Elwise(postgresql = 'xfail'), # str.get # TODO: custom test # str.index # TODO: custom test # str.join # TODO: custom test 'str.len': _.str.len() >> Elwise(), - 'str.ljust': _.str.ljust(5) >> Elwise(xfail = ['postgresql']), # pg formatstr function + 'str.ljust': _.str.ljust(5) >> Elwise(postgresql = 'xfail'), # pg formatstr function 'str.lower': _.str.lower() >> Elwise(), 'str.lstrip': _.str.lstrip() >> Elwise(), - 'str.match': _.str.match('a|c') >> Elwise(xfail = ['postgresql']), + 'str.match': _.str.match('a|c') >> Elwise(postgresql = 'xfail'), # str.normalize - 'str.pad': _.str.pad(5) >> Elwise(xfail = ['postgresql']), + 'str.pad': _.str.pad(5) >> Elwise(postgresql = 'xfail'), # str.partition # str.repeat - 'str.replace': _.str.replace('a|b', 'c') >> Elwise(xfail = ['postgresql']), - 'str.rfind': _.str.rfind('a') >> Elwise(xfail = ['postgresql']), + 'str.replace': _.str.replace('a|b', 'c') >> Elwise(postgresql = 'xfail'), + 'str.rfind': _.str.rfind('a') >> Elwise(postgresql = 'xfail'), # str.rindex - 'str.rjust': _.str.rjust(5) >> Elwise(xfail = ['postgresql']), + 'str.rjust': _.str.rjust(5) >> Elwise(postgresql = 'xfail'), # str.rpartition 'str.rstrip': _.str.rstrip() >> Elwise(), - 'str.slice': _.str.slice(step = 2) >> Elwise(xfail = ['postgresql']), - 'str.slice_replace': _.str.slice_replace(2, repl = 'x') >> Elwise(xfail = ['postgresql']), - 'str.split': _.str.split('a|b') >> Elwise(xfail = ['postgresql']), - 'str.rsplit': 
_.str.rsplit('a|b', n = 1) >> Elwise(xfail = ['postgresql']), + 'str.slice': _.str.slice(step = 2) >> Elwise(postgresql = 'xfail'), + 'str.slice_replace': _.str.slice_replace(2, repl = 'x') >> Elwise(postgresql = 'xfail'), + 'str.split': _.str.split('a|b') >> Elwise(postgresql = 'xfail'), + 'str.rsplit': _.str.rsplit('a|b', n = 1) >> Elwise(postgresql = 'xfail'), 'str.startswith': _.str.startswith('a|b') >> Elwise(), 'str.strip': _.str.strip() >> Elwise(), - 'str.swapcase': _.str.swapcase() >> Elwise(xfail = ['postgresql']), + 'str.swapcase': _.str.swapcase() >> Elwise(postgresql = 'xfail'), 'str.title': _.str.title() >> Elwise(), # str.translate 'str.upper': _.str.upper() >> Elwise(), - 'str.wrap': _.str.wrap(2) >> Elwise(xfail = ['postgresql']), + 'str.wrap': _.str.wrap(2) >> Elwise(postgresql = 'xfail'), # str.zfill - 'str.isalnum': _.str.isalnum() >> Elwise(xfail = ['postgresql']), - 'str.isalpha': _.str.isalpha() >> Elwise(xfail = ['postgresql']), - 'str.isdigit': _.str.isdigit() >> Elwise(xfail = ['postgresql']), - 'str.isspace': _.str.isspace() >> Elwise(xfail = ['postgresql']), - 'str.islower': _.str.islower() >> Elwise(xfail = ['postgresql']), - 'str.isupper': _.str.isupper() >> Elwise(xfail = ['postgresql']), - 'str.istitle': _.str.istitle() >> Elwise(xfail = ['postgresql']), - 'str.isnumeric': _.str.isnumeric() >> Elwise(xfail = ['postgresql']), - 'str.isdecimal': _.str.isdecimal() >> Elwise(xfail = ['postgresql']), + 'str.isalnum': _.str.isalnum() >> Elwise(postgresql = 'xfail'), + 'str.isalpha': _.str.isalpha() >> Elwise(postgresql = 'xfail'), + 'str.isdigit': _.str.isdigit() >> Elwise(postgresql = 'xfail'), + 'str.isspace': _.str.isspace() >> Elwise(postgresql = 'xfail'), + 'str.islower': _.str.islower() >> Elwise(postgresql = 'xfail'), + 'str.isupper': _.str.isupper() >> Elwise(postgresql = 'xfail'), + 'str.istitle': _.str.istitle() >> Elwise(postgresql = 'xfail'), + 'str.isnumeric': _.str.isnumeric() >> Elwise(postgresql = 'xfail'), + 
'str.isdecimal': _.str.isdecimal() >> Elwise(postgresql = 'xfail'), # str.get_dummies }, 'categories': { @@ -518,6 +524,7 @@ class WontImplement(Result): pass nested_spec[category] = d = {} for name, call in call_dict.items(): d[name] = get_type_info(call) + d[name]['category'] = category spec = dict(itertools.chain(*iter(d.items() for d in nested_spec.values()))) diff --git a/siuba/sql/dialects/postgresql.py b/siuba/sql/dialects/postgresql.py index 924349af..ba27280d 100644 --- a/siuba/sql/dialects/postgresql.py +++ b/siuba/sql/dialects/postgresql.py @@ -2,7 +2,7 @@ from ..translate import ( SqlColumn, SqlColumnAgg, base_scalar, base_agg, base_win, SqlTranslator, - win_agg, sql_scalar, + win_agg, sql_scalar, sql_agg ) import sqlalchemy.sql.sqltypes as sa_types from sqlalchemy import sql @@ -53,14 +53,19 @@ def sql_func_contains(col, pat, case = True, flags = 0, na = None, regex = True) ) aggregate = SqlTranslator( - base_agg + base_agg, + all = sql_agg("bool_and"), + any = sql_agg("bool_or"), + std = sql_agg("stddev_samp"), + var = sql_agg("var_samp"), ) window = SqlTranslator( base_win, any = win_agg("bool_or"), all = win_agg("bool_and"), - lag = win_agg("lag") + lag = win_agg("lag"), + var = win_agg("var_samp"), ) funcs = dict(scalar = scalar, aggregate = aggregate, window = window) diff --git a/siuba/sql/utils.py b/siuba/sql/utils.py index 8e812b81..53a7b0d2 100644 --- a/siuba/sql/utils.py +++ b/siuba/sql/utils.py @@ -14,3 +14,28 @@ def get_sql_classes(name): 'aggregate': getattr(mod, agg_name) } + +def mock_sqlalchemy_engine(dialect): + """ + Create a sqlalchemy.engine.Engine without it connecting to a database. 
+ + Examples + -------- + + :: + from siuba.sql import LazyTbl + from siuba import _, mutate, show_query + + engine = mock_sqlalchemy_engine('postgresql') + tbl = LazyTbl(engine, 'some_table', ['x']) + + query = mutate(tbl, y = _.x + _.x) + show_query(query) + + """ + from sqlalchemy.engine import Engine + from sqlalchemy.dialects import registry + + dialect_cls = registry.load('postgresql') + return Engine(None, dialect_cls(), '') + diff --git a/siuba/tests/test_dply_series_methods.py b/siuba/tests/test_dply_series_methods.py index 2a3074c9..0159a191 100644 --- a/siuba/tests/test_dply_series_methods.py +++ b/siuba/tests/test_dply_series_methods.py @@ -1,6 +1,6 @@ from siuba.siu import Symbolic, strip_symbolic from siuba.spec.series import spec -from .helpers import data_frame, assert_equal_query, backend_pandas, SqlBackend +from .helpers import data_frame, assert_equal_query, backend_pandas, SqlBackend, PandasBackend import pytest # TODO: dot, corr, cov @@ -69,20 +69,58 @@ def assert_src_array_equal(src, dst): y = [1,2,3,4,5,6] ) -data = { +DATA = data = { 'dt': data_dt, 'str': data_str, None: data_default, 'bool': data_bool } -# Tests ======================================================================= +def get_data(entry, data, backend = None): + + req_bool = entry['result'].get('op') == 'bool' + + # pandas is forgiving to bool inputs + if isinstance(backend, PandasBackend): + req_bool = False + + return data['bool'] if req_bool else data[entry['accessor']] + -# Series expr and call return same result +def test_missing_implementation(entry, backend): + # Check whether test should xfail, skip, or ------------------------------- + backend_status = entry['result'].get(backend.name) -# Series expr and Postgres return same result + # case: Needs to be implmented + # TODO(table): uses xfail + if backend_status == "xfail": + pytest.xfail("TODO: impelement this translation") + + # case: Can't be used in a mutate (e.g. 
a SQL ordered set aggregate function) + # TODO(table): no_mutate + if backend.name in entry['result'].get('no_mutate', []): + pytest.skip("Spec'd failure") + + # case: won't be implemented + if entry['result'].get(backend.name) == "not_impl": + pytest.skip() + + +def get_df_expr(entry): + str_expr = str(entry['expr_frame']) + call_expr = strip_symbolic(eval(str_expr, {'_': _})) + + return str_expr, call_expr + +def cast_result_type(entry, backend, ser): + sql_type = entry['result'].get('sql_type') + if isinstance(backend, SqlBackend) and sql_type == 'float': + return ser.astype('float') + + return ser + +# Tests ======================================================================= -# Series agg and trivial group agg return same result (when cast dimless) def test_series_against_call(entry): if entry['result']['type'] == "Window": @@ -102,6 +140,7 @@ def test_series_against_call(entry): def test_frame_expr(entry): + # TODO: remove this test, not checking anything new df = data[entry['accessor']] # TODO: once reading from yaml, no need to repr str_expr = str(entry['expr_frame']) @@ -115,104 +154,106 @@ def test_frame_expr(entry): assert_src_array_equal(res, dst) -#@backend_pandas -@pytest.mark.skip_backend('sqlite') -def test_frame_mutate(skip_backend, backend, entry): - # CASE 1: Needs to be implmented - if backend.name in entry['result'].get('xfail', []): - pytest.xfail("TODO: impelement this translation") - - # CASE 2: Can't be used in a mutate (e.g. 
a SQL ordered set aggregate function) - if backend.name in entry['result'].get('no_mutate', []): - pytest.skip("Spec'd failure") - - # CASE 3: Uses an operation that can only take boolean inputs - if isinstance(backend, SqlBackend) and entry['result'].get('op') == 'bool': - crnt_data = data['bool'] - - else: - crnt_data = data[entry['accessor']] - - df = backend.load_df(crnt_data) +def test_pandas_grouped_frame_fast_not_implemented(notimpl_entry): + from siuba.experimental.pd_groups.dialect import fast_mutate + gdf = data[notimpl_entry['accessor']].groupby('g') # TODO: once reading from yaml, no need to repr - str_expr = str(entry['expr_frame']) + str_expr = str(notimpl_entry['expr_frame']) call_expr = strip_symbolic(eval(str_expr, {'_': _})) - dst_series = eval(str_expr, {'_': crnt_data}) - dst = crnt_data.assign(result = dst_series) + with pytest.raises(NotImplementedError): + res = fast_mutate(gdf, result = call_expr) - # CASE 4: marked as NotImplemented (meaning no plan to implement) - if backend.name in entry['result'].get('not_impl', []): - with pytest.raises(NotImplementedError): - mutate(df, result = call_expr) - # we're done - return - # CASE 5: - if isinstance(backend, SqlBackend) and entry['result'].get('sql_type') == 'float': - dst['result'] = dst['result'].astype('float') +#@backend_pandas +@pytest.mark.skip_backend('sqlite') +def test_frame_mutate(skip_backend, backend, entry): + test_missing_implementation(entry, backend) - # otherwise, verify returns same result as mutate - assert_equal_query(df, mutate(result = call_expr), dst) + # Prepare input data ------------------------------------------------------ + # case: inputs must be boolean + crnt_data = get_data(entry, DATA, backend) + df = backend.load_df(crnt_data) + # Execute mutate ---------------------------------------------------------- + str_expr, call_expr = get_df_expr(entry) -def test_pandas_grouped_frame_fast_not_implemented(notimpl_entry): - from siuba.experimental.pd_groups.dialect 
import fast_mutate - gdf = data[notimpl_entry['accessor']].groupby('g') + # Run test for equality w/ ungrouped pandas ---- + dst = crnt_data.assign(result = call_expr(crnt_data)) + dst['result'] = cast_result_type(entry, backend, dst['result']) - # TODO: once reading from yaml, no need to repr - str_expr = str(notimpl_entry['expr_frame']) - call_expr = strip_symbolic(eval(str_expr, {'_': _})) + assert_equal_query( + df, + mutate(result = call_expr), + dst + ) + + # Run test for equality w/ grouped pandas ---- + g_dst = crnt_data.groupby('g').apply(lambda d: d.assign(result = call_expr)).reset_index(drop = True) + g_dst['result'] = cast_result_type(entry, backend, g_dst['result']) + assert_equal_query( + df, + group_by(_.g) >> mutate(result = call_expr), + g_dst + ) - with pytest.raises(NotImplementedError): - res = fast_mutate(gdf, result = call_expr) - def test_pandas_grouped_frame_fast_mutate(entry): from siuba.experimental.pd_groups.dialect import fast_mutate, DataFrameGroupBy - gdf = data[entry['accessor']].groupby('g') + gdf = get_data(entry, DATA).groupby('g') - # TODO: once reading from yaml, no need to repr - str_expr = str(entry['expr_frame']) - call_expr = strip_symbolic(eval(str_expr, {'_': _})) + # Execute mutate ---------------------------------------------------------- + str_expr, call_expr = get_df_expr(entry) res = fast_mutate(gdf, result = call_expr) dst = mutate(gdf, result = call_expr) - # fix mutate's current bad behavior of reordering rows --- - # (fixed in issue #139) - dst_obj_fixed = dst.obj - # TODO: apply mark to skip failing tests, rather than downcast # pandas grouped aggs, when not using cython, _try_cast back to original type # but since mutate uses apply, it doesn't :/. Currently only affects median func. 
+ dst_obj = dst.obj if str_expr == '_.x.median()': - dst_obj_fixed['result'] = gdf._try_cast(dst_obj_fixed['result'], gdf.x.obj) + dst_obj['result'] = gdf._try_cast(dst_obj['result'], gdf.x.obj) assert isinstance(dst, DataFrameGroupBy) - assert_frame_equal(res.obj, dst_obj_fixed) + assert_frame_equal(res.obj, dst_obj) + +@pytest.mark.skip_backend('sqlite') +def test_frame_summarize(skip_backend, backend, agg_entry): + entry = agg_entry + test_missing_implementation(entry, backend) -#@pytest.mark.skip_backend('sqlite') -@backend_pandas -def test_frame_summarize_trivial(backend, agg_entry): - crnt_data = data[agg_entry['accessor']] + # Prepare input data ------------------------------------------------------ + # case: inputs must be boolean + crnt_data = get_data(entry, DATA, backend) df = backend.load_df(crnt_data) - # TODO: once reading from yaml, no need to repr - str_expr = str(agg_entry['expr_frame']) + # Execute summarize -------------------------------------------------------- + str_expr, call_expr = get_df_expr(entry) - call_expr = strip_symbolic(eval(str_expr, {'_': _})) - res = summarize(df, result = call_expr) + dst = data_frame(result = call_expr(crnt_data)) - # Perform a trivial group agg, where the entire frame is 1 group - dst_out = eval(str_expr, {'_': df}) - dst_series = dst_out if isinstance(dst_out, pd.Series) else pd.Series(dst_out) - dst = pd.DataFrame({'result': dst_series}) - - assert_frame_equal(res, dst) + # Process output ---------------------------------------------------------- + # case: output is of a different type than w/ pandas + dst['result'] = cast_result_type(entry, backend, dst['result']) + + # Run test for equality w/ pandas ---- + # otherwise, verify returns same result as summarize + assert_equal_query( + df, + summarize(result = call_expr), + dst + ) + + dst_g = crnt_data.groupby('g').apply(call_expr).reset_index().rename(columns = {0: 'result'}) + assert_equal_query( + df, + group_by(_.g) >> summarize(result = call_expr), + dst_g 
+ ) # Edge Cases ==================================================================