Skip to content

Commit

Permalink
fix missing level, adds some new
Browse files Browse the repository at this point in the history
  • Loading branch information
drizk1 committed Aug 24, 2024
1 parent ddc84b9 commit 191e08e
Show file tree
Hide file tree
Showing 5 changed files with 258 additions and 19 deletions.
6 changes: 3 additions & 3 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "TidierCats"
uuid = "79ddc9fe-4dbf-4a56-a832-df41fb326d23"
authors = ["Daniel Rizk"]
version = "0.1.1"
version = "0.1.2"

[deps]
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
Expand All @@ -10,10 +10,10 @@ Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"

[compat]
CategoricalArrays = "0.10"
CategoricalArrays = "0.10, 1.0"
DataFrames = "1.5"
Reexport = "0.2, 1"
julia = "1.6"
julia = "1.9"

[extras]
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
Expand Down
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@
- `cat_collapse()`
- `cat_lump_min()`
- `cat_lump_prop()`
- `cat_recode()`
- `cat_other()`
- `cat_replace_missing()`
- `as_categorical()`

## Installation
Expand Down
3 changes: 3 additions & 0 deletions docs/src/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,7 @@ In addition, this package includes:
- `cat_collapse()`
- `cat_lump_min()`
- `cat_lump_prop()`
- `cat_recode()`
- `cat_other()`
- `cat_replace_missing()`
- `as_categorical()`
152 changes: 147 additions & 5 deletions src/TidierCats.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ using Reexport
@reexport using CategoricalArrays

export cat_rev, cat_relevel, cat_infreq, cat_lump, cat_reorder, cat_collapse, cat_lump_min, cat_lump_prop
export as_categorical, as_integer
export as_categorical, as_integer, cat_replace_missing, cat_other, cat_recode
include("catsdocstrings.jl")

"""
Expand All @@ -24,10 +24,58 @@ end
"""
$docstring_cat_relevel
"""
function cat_relevel(cat_array::CategoricalArray, levels_order::Vector{String})
ordered_levels = [x for x in levels_order if x in levels(cat_array)]
append!(ordered_levels, [x for x in levels(cat_array) if x ordered_levels])
new_cat_array = CategoricalArray([String(v) for v in cat_array], ordered=true, levels=ordered_levels)
function cat_relevel(cat_array::CategoricalArray{Union{Missing, String}}, levels_order::Vector{Union{String, Missing}})
    # Method for string categorical arrays that allow `missing`, called with an
    # ordering vector that may itself contain `missing`.
    #
    # Returns a copy of `cat_array` whose levels are: the entries of
    # `levels_order` that are existing levels (in the order given), followed by
    # every remaining level in its current relative order. The input array is
    # left untouched, matching the generic `cat_relevel` method.

    # Compare on raw values rather than on whatever wrapper `levels` returns.
    unwrapped_levels = unwrap.(levels(cat_array))

    ordered_levels = [x for x in levels_order if !ismissing(x) && x in unwrapped_levels]

    # `missing` is only carried over when it is both requested and reported as
    # a level. NOTE(review): `levels` normally omits `missing`, so this branch
    # is probably never taken -- confirm against CategoricalArrays.
    if any(ismissing, levels_order) && any(ismissing, unwrapped_levels)
        push!(ordered_levels, missing)
    end

    # Levels the caller did not mention keep their relative order at the end.
    append!(
        ordered_levels,
        [x for x in unwrapped_levels if !ismissing(x) && !(x in ordered_levels)],
    )

    # Work on a copy: a function without `!` should not reorder the caller's array.
    new_cat_array = copy(cat_array)
    levels!(new_cat_array, ordered_levels)
    return new_cat_array
end

function cat_relevel(cat_array, levels_order::Vector{String}; after::Int = 0)
    # Move the levels named in `levels_order` to a new position and return a
    # copy of `cat_array` with the updated level order.
    #
    # Arguments
    # - `cat_array`: array whose levels are reordered (not modified).
    # - `levels_order`: levels to move, in the order they should appear.
    #   Names that are not existing levels are ignored.
    # - `after`: how many of the untouched levels stay in front of the moved
    #   ones. `0` (the default) puts the moved levels first.
    #
    # Throws an error when `after` is negative or exceeds the number of levels.

    # Compare on raw values, as the Missing-aware method does.
    current_levels = unwrap.(levels(cat_array))

    if after < 0 || after > length(current_levels)
        error("'after' must be between 0 and the number of levels")
    end

    # `unique` keeps a repeated name in `levels_order` from producing
    # duplicate levels, which `levels!` rejects.
    mentioned_levels = unique([x for x in levels_order if x in current_levels])
    unmentioned_levels = [x for x in current_levels if !(x in mentioned_levels)]

    # When `after` is larger than the number of untouched levels, the moved
    # levels simply go last.
    insert_at = min(after, length(unmentioned_levels))
    new_levels = vcat(
        unmentioned_levels[1:insert_at],
        mentioned_levels,
        unmentioned_levels[(insert_at + 1):end],
    )

    new_cat_array = copy(cat_array)
    levels!(new_cat_array, new_levels)

    return new_cat_array
end

Expand Down Expand Up @@ -188,4 +236,98 @@ function as_integer(cat_array::CategoricalArray)
return CategoricalArrays.levelcode.(cat_array)
end

"""
$docstring_cat_replace_missing
"""
function cat_replace_missing(cat_array::CategoricalArray{Union{Missing, String}}, txt::String)
    # Substitute the label `txt` for every `missing` entry. Uses the
    # non-mutating `replace`, so `cat_array` itself is left as it was.
    substitution = missing => txt
    return replace(cat_array, substitution)
end

"""
$docstring_cat_other
"""
function cat_other(f::Union{CategoricalArray, AbstractVector};
                   keep::Union{Nothing, Vector{String}} = nothing,
                   drop::Union{Nothing, Vector{String}} = nothing,
                   other_level::String = "Other")
    # Collapse a chosen set of levels into the single level `other_level`.
    #
    # Arguments
    # - `f`: categorical array, or a plain vector that is converted with
    #   `categorical`. It is never modified.
    # - `keep`: levels to preserve; every other level becomes `other_level`.
    # - `drop`: levels to replace with `other_level`; all others are preserved.
    # - `other_level`: name of the catch-all level.
    #
    # Exactly one of `keep` / `drop` must be given, otherwise an error is thrown.
    # Returns a new categorical array.

    if !isnothing(keep) && !isnothing(drop)
        error("Only one of 'keep' or 'drop' should be specified, not both.")
    end

    if isnothing(keep) && isnothing(drop)
        error("Either 'keep' or 'drop' must be specified.")
    end

    # Always operate on a fresh array (`categorical` already builds a new one).
    new_f = f isa CategoricalArray ? copy(f) : categorical(f)

    # Raw level values, so set operations against the `String` inputs are plain
    # string comparisons.
    current_levels = unwrap.(levels(new_f))

    levels_to_change = if isnothing(keep)
        intersect(current_levels, drop)
    else
        setdiff(current_levels, keep)
    end

    # Register the catch-all level up front: assigning an unknown value adds a
    # level implicitly, which ordered arrays do not allow.
    if !(other_level in current_levels)
        levels!(new_f, vcat(current_levels, other_level))
    end

    for level in levels_to_change
        # Comparing a `missing` entry yields `missing`, which is not a valid
        # logical index; such entries never match.
        mask = coalesce.(new_f .== level, false)
        new_f[mask] .= other_level
    end

    # Drop the collapsed levels and keep `other_level` after the surviving ones
    # (unless it was already one of them).
    new_levels = union(setdiff(current_levels, levels_to_change), [other_level])
    levels!(new_f, new_levels)

    return new_f
end


"""
$docstring_cat_recode
"""
function cat_recode(f::Union{CategoricalArray, AbstractVector}; kwargs...)
    # Rename or merge levels. Each keyword has the form
    # `new_level = [old_level, ...]`: every listed old level is replaced by the
    # keyword's name.
    #
    # - `f`: categorical array, or a plain vector that is converted with
    #   `categorical`. It is never modified.
    # - An old level that is not currently a level only triggers a warning.
    # - Keyword names are always `Symbol`s, so `nothing = [...]` produces a
    #   level literally called "nothing"; it does not remove anything.
    #
    # Returns a new categorical array whose levels are the values present after
    # recoding, in order of first appearance.

    new_f = f isa CategoricalArray ? copy(f) : categorical(f)

    for (new_level, old_levels) in kwargs
        new_level_str = String(new_level)
        old_levels_str = [String(level) for level in old_levels]

        for old_level in old_levels_str
            # Re-read the levels each time: an earlier keyword may have added one.
            known_levels = unwrap.(levels(new_f))

            if old_level in known_levels
                # Register the target level explicitly; ordered arrays refuse
                # to gain a level through plain assignment.
                if !(new_level_str in known_levels)
                    levels!(new_f, vcat(known_levels, new_level_str))
                end
                # `missing` entries compare as `missing`; treat them as no match
                # so the mask is a valid logical index.
                mask = coalesce.(new_f .== old_level, false)
                new_f[mask] .= new_level_str
            else
                @warn "Unknown level in input factor: $old_level"
            end
        end
    end

    # Keep only the levels still in use, ordered by first appearance.
    levels!(new_f, unique(skipmissing(new_f)))

    return new_f
end



end
113 changes: 102 additions & 11 deletions src/catsdocstrings.jl
Original file line number Diff line number Diff line change
Expand Up @@ -36,14 +36,14 @@ julia> cat_rev(cat_array)

const docstring_cat_relevel =
"""
cat_relevel(cat_array::CategoricalArray, levels_order::Vector{String})
cat_relevel(cat_array::CategoricalArray, levels_order::Vector{String}; after::Int=0)
Reorders the levels in a categorical array according to the provided order.
# Arguments
`cat_array`: Input categorical array.
`levels_order`: Vector of levels in the desired order.
`after`: Keyword argument giving the position after which to insert the moved levels. Defaults to `0`, which places them first.
# Returns
Categorical array with levels reordered according to levels_order.
Expand All @@ -59,14 +59,16 @@ julia> cat_array = CategoricalArray(["A", "B", "C", "A", "B", "B"], ordered=true
"B"
"B"
julia> cat_relevel(cat_array, ["B", "A", "C"])
6-element CategoricalArrays.CategoricalArray{String,1,UInt32}:
"A"
"B"
"C"
"A"
"B"
"B"
julia> println(levels(cat_relevel(cat_array, ["B", "A", "C"])))
["B", "A", "C"]
julia> println(levels(cat_relevel(cat_array, ["A"], after=1)))
["B", "A", "C"]
julia> cat_array = CategoricalArray(["A", "B", "C", "A", "B", missing], ordered=true);
julia> println(levels(cat_relevel(cat_array, ["C", "A", "B", missing]), skipmissing=false))
Union{Missing, String}["C", "A", "B", missing]
```
"""

Expand Down Expand Up @@ -316,4 +318,93 @@ julia> cat_lump_prop(cat_array, 0.3)
const docstring_as_integer =
"""
Converts a CategoricalValue or CategoricalArray to an integer or vector of integers.
"""
"""
const docstring_cat_replace_missing =
"""
    cat_replace_missing(cat_array::CategoricalArray{Union{Missing, String}}, txt::String)

Replaces every `missing` value in a categorical array with the label `txt`.

# Arguments
- `cat_array`: Categorical array of strings that may contain `missing` values.
- `txt`: Label to substitute for each `missing` value.

# Returns
The result of replacing each `missing` entry of `cat_array` with `txt`. The input array is not modified.

# Examples
```jldoctest
julia> cat_array = CategoricalArray(["a", "b", missing, "a", missing, "c"])
6-element CategoricalArray{Union{Missing, String},1,UInt32}:
 "a"
 "b"
 missing
 "a"
 missing
 "c"

julia> cat_replace_missing(cat_array, "unknown")
6-element CategoricalArray{Union{Missing, String},1,UInt32}:
 "a"
 "b"
 "unknown"
 "a"
 "unknown"
 "c"
```
"""

const docstring_cat_recode =
"""
    cat_recode(cat_array::Union{CategoricalArray, AbstractVector}; kwargs...)

Recodes the levels in a categorical array based on a provided mapping.

# Arguments
- `cat_array`: Categorical array to recode. A plain vector is converted to a categorical array first.
- `kwargs`: Keyword arguments of the form `new_level = [old_levels...]`. The keyword name is the new level, and its value lists the existing levels to replace with it. Levels not listed in any value are kept the same. Listing a level that does not exist only emits a warning.

# Returns
A new categorical array with the levels recoded. The resulting levels are ordered by first appearance in the recoded data.

# Examples
```jldoctest
julia> x = CategoricalArray(["apple", "tomato", "banana", "dear"]);

julia> println(levels(cat_recode(x, fruit = ["apple", "banana"], nothing = ["tomato"])))
["fruit", "nothing", "dear"]
```
"""

const docstring_cat_other =
"""
    cat_other(cat_array::Union{CategoricalArray, AbstractVector}; keep=nothing, drop=nothing, other_level::String="Other")

Replaces a chosen set of levels in a categorical array with the 'other' level.

# Arguments
- `cat_array`: Categorical array whose levels are replaced. A plain vector is converted to a categorical array first.
- `keep`: Vector of levels to preserve. Every other level is replaced by `other_level`.
- `drop`: Vector of levels to replace with `other_level`. All other levels are preserved.
- `other_level`: The level name used for the replaced levels. Default is "Other".

Exactly one of `keep` or `drop` must be specified; an error is thrown otherwise.

# Returns
A new categorical array in which the selected levels are replaced by the 'other' level.

# Examples
```jldoctest
julia> cat_array = CategoricalArray(["A", "B", "C", "D", "E"]);

julia> cat_other(cat_array, drop = ["A", "B"])
5-element CategoricalArrays.CategoricalArray{String,1,UInt32}:
 "Other"
 "Other"
 "C"
 "D"
 "E"
```
"""

0 comments on commit 191e08e

Please sign in to comment.