data-engineering-collective · xhochy · Oct 24, 2022 · Oct 19, 2022 · Oct 19, 2022 · Oct 19, 2022
@@ -2,6 +2,11 @@
 Changelog
 =========
 
+Plateau 4.1.3 (2022-10-24)
+==========================
+
+* Fix loading of a partition that contains only nulls as categorical (#55)
+
 Plateau 4.1.2 (2022-10-20)
 ==========================
 

@@ -288,7 +288,15 @@ def _restore_dataframe(
                 )
 
         table = _reset_dictionary_columns(table, exclude=categories)
-        df = table.to_pandas(categories=categories, date_as_object=date_as_object)
+
+        df = table.to_pandas(date_as_object=date_as_object)
+
+        # XXX: Patch until PyArrow bug is resolved: https://issues.apache.org/jira/browse/ARROW-18099
+        if categories:
+            for col in categories:
+                if col in df:
+                    df[col] = df[col].astype("category")
+
         df.columns = df.columns.map(ensure_unicode_string_type)
         if predicates:
             df = filter_df_from_predicates(

@@ -429,7 +429,20 @@ def test_read_categorical(store):
     assert df.dtypes["col"] == pd.CategoricalDtype(["a"], ordered=False)
 
 
-def test_read_categorical_empty(store):
+def test_read_empty_categorical(store):
+    df = pd.DataFrame({"col": [None]}).astype({"col": "category"})  # one row, only a null value
+
+    serialiser = ParquetSerializer()
+    key = serialiser.store(store, "prefix", df)
+
+    df = serialiser.restore_dataframe(store, key)
+    assert df.dtypes["col"] == "O"  # no categories requested: expect object dtype
+
+    df = serialiser.restore_dataframe(store, key, categories=["col"])
+    assert df.dtypes["col"] == pd.CategoricalDtype([], ordered=False)  # requested: categorical, no categories
+
+
+def test_read_categorical_empty_dataframe(store):
 
     df = pd.DataFrame({"col": ["a"]}).astype({"col": "category"}).iloc[:0]
     serialiser = ParquetSerializer()