Skip to content

Commit

Permalink
fix: fct_collapse preserves missing values (#262)
Browse files (browse the repository at this point in the history)
  • Loading branch information
machow authored Aug 14, 2020
1 parent cdeb465 commit 8e12b4a
Showing 1 changed file with 16 additions and 4 deletions.
20 changes: 16 additions & 4 deletions siuba/dply/forcats.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import pandas as pd
import numpy as np

from siuba.siu import symbolic_dispatch
from collections import defaultdict

# fct_reorder -----------------------------------------------------------------

Expand Down Expand Up @@ -104,6 +106,10 @@ def fct_collapse(fct, recat, group_other = None) -> pd.Categorical:
['ab', 'ab', 'c']
Categories (2, object): ['ab', 'c']
>>> fct_collapse(['a', 'b', None], {'a': ['b']})
['a', 'a', NaN]
Categories (1, object): ['a']
"""
if not isinstance(fct, pd.Categorical):
fct = pd.Categorical(fct)
Expand All @@ -130,12 +136,18 @@ def fct_collapse(fct, recat, group_other = None) -> pd.Categorical:
# map from old cat to new code ----
# calculate new codes
ordered_cats = {new: True for old, new in cat_to_new.items()}

new_cat_set = {k: ii for ii, k in enumerate(ordered_cats)}
# map old cats to new codes
remap_code = {old: new_cat_set[new] for old, new in cat_to_new.items()}

new_codes = fct.map(remap_code)
new_cats = list(new_cat_set.keys())
# build a lookup array indexed by (old code + 1), so the missing-value code (-1) hits index 0, which holds -1
old_code_to_new = np.array(
[-1] + [new_cat_set[new_cat] for new_cat in cat_to_new.values()]
)

# map old cats to new codes
#remap_code = {old: new_cat_set[new] for old, new in cat_to_new.items()}
new_codes = old_code_to_new[fct.codes + 1]
new_cats = list(new_cat_set)
return pd.Categorical.from_codes(new_codes, new_cats)


Expand Down

0 comments on commit 8e12b4a

Please sign in to comment.