From 6aaab71e430c87deee18ee103b2135ac19f6dc17 Mon Sep 17 00:00:00 2001 From: Maxime Beauchemin Date: Fri, 6 Apr 2018 01:04:12 +0000 Subject: [PATCH 1/2] [druid] fix 'Unorderable types' when col has nulls Error "unorderable types: str() < int()" occurs when grouping by a numerical Druid column that contains null values. * druid/pydruid returns strings in the dataframe with NAs for nulls * Superset has custom logic around get_fillna_for_col that fills in the NULLs based on declared column type (FLOAT here), so now we have a mixed bag of types in the series * pandas chokes on pivot_table or groupby operations as it cannot sort mixed types The approach here is to stringify and fillna('') to get a consistent series. --- superset/connectors/druid/models.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/superset/connectors/druid/models.py b/superset/connectors/druid/models.py index 25c68acae1283..b9263129f2a94 100644 --- a/superset/connectors/druid/models.py +++ b/superset/connectors/druid/models.py @@ -1277,6 +1277,21 @@ def run_query( # noqa / druid client.query_builder.last_query.query_dict, indent=2) return query_str + @staticmethod + def homogenize_types(df, groupby_cols): + """Converting all GROUPBY columns to strings + + When grouping by a numeric (say FLOAT) column, pydruid returns + strings in the dataframe. This creates issues downstream related + to having mixed types in the dataframe + + Here we replace None with '' and make the whole series a + str instead of an object.
+ """ + for col in groupby_cols: + df[col] = df[col].fillna('').astype(str) + return df + def query(self, query_obj): qry_start_dttm = datetime.now() client = self.cluster.get_pydruid_client() @@ -1284,6 +1299,8 @@ def query(self, query_obj): client=client, query_obj=query_obj, phase=2) df = client.export_pandas() + df = self.homogenize_types(df, query_obj.get('groupby', [])) + if df is None or df.size == 0: raise Exception(_('No data was returned.')) df.columns = [ From 5e4888af0a65cb725bca642a8903bd98d9dc3206 Mon Sep 17 00:00:00 2001 From: Maxime Beauchemin Date: Fri, 6 Apr 2018 04:58:02 +0000 Subject: [PATCH 2/2] typo --- tests/druid_tests.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/druid_tests.py b/tests/druid_tests.py index b0d9caff76b70..868cd5e2d51f8 100644 --- a/tests/druid_tests.py +++ b/tests/druid_tests.py @@ -61,6 +61,7 @@ def __reduce__(self): 'timestamp': '2012-01-01T00:00:00.000Z', 'event': { 'dim1': 'Canada', + 'dim2': 'boy', 'metric1': 12345678, }, }, @@ -69,6 +70,7 @@ def __reduce__(self): 'timestamp': '2012-01-01T00:00:00.000Z', 'event': { 'dim1': 'USA', + 'dim2': 'girl', 'metric1': 12345678 / 2, }, }, @@ -165,7 +167,7 @@ def test_client(self, PyDruid): 'row_limit': 5000, 'include_search': 'false', 'metrics': ['count'], - 'groupby': ['dim1', 'dim2d'], + 'groupby': ['dim1', 'dim2'], 'force': 'true', } # two groupby