Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Preserve coordinate dtypes #178

Merged
merged 5 commits into from
Jan 3, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 34 additions & 39 deletions bioframe/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,11 +274,18 @@ def _overlap_intidxs(df1, df2, how="left", cols1=None, cols2=None, on=None):
if on is not None:
group_list1 += on
group_list2 += on
df1_groups = df1.groupby(group_list1, observed=True, dropna=False).indices

df1_groups = df1.groupby(group_list1, observed=True, dropna=False).indices
df2_groups = df2.groupby(group_list2, observed=True, dropna=False).indices
all_groups = set.union(set(df1_groups), set(df2_groups))

# Extract coordinate columns
# .values will return a numpy array or pandas array, depending on the dtype
starts1 = df1[sk1].values
ends1 = df1[ek1].values
starts2 = df2[sk2].values
ends2 = df2[ek2].values

# Find overlapping intervals per group (determined by chrom and on).
overlap_intidxs = []
for group_keys in all_groups:
Expand All @@ -293,11 +300,12 @@ def _overlap_intidxs(df1, df2, how="left", cols1=None, cols2=None, on=None):
both_groups_nonempty = (df1_group_idxs.size > 0) and (df2_group_idxs.size > 0)

if both_groups_nonempty:

overlap_idxs_loc = arrops.overlap_intervals(
df1[sk1].values[df1_group_idxs],
df1[ek1].values[df1_group_idxs],
df2[sk2].values[df2_group_idxs],
df2[ek2].values[df2_group_idxs],
starts1[df1_group_idxs],
ends1[df1_group_idxs],
starts2[df2_group_idxs],
ends2[df2_group_idxs],
)

# Convert local per-chromosome indices into the
Expand Down Expand Up @@ -433,7 +441,6 @@ def overlap(
df_overlap : pandas.DataFrame

"""

ck1, sk1, ek1 = _get_default_colnames() if cols1 is None else cols1
ck2, sk2, ek2 = _get_default_colnames() if cols2 is None else cols2
checks.is_bedframe(df1, raise_errors=True, cols=[ck1, sk1, ek1])
Expand All @@ -460,13 +467,15 @@ def overlap(
cols2=cols2,
on=on,
)
events1 = overlap_df_idxs[:, 0]
events2 = overlap_df_idxs[:, 1]

# Generate output tables.
index_col = return_index if isinstance(return_index, str) else "index"
index_col_1 = index_col + suffixes[0]
index_col_2 = index_col + suffixes[1]
df_index_1 = pd.DataFrame({index_col_1: df1.index[overlap_df_idxs[:, 0]]})
df_index_2 = pd.DataFrame({index_col_2: df2.index[overlap_df_idxs[:, 1]]})
df_index_1 = pd.DataFrame({index_col_1: df1.index[events1]})
df_index_2 = pd.DataFrame({index_col_2: df2.index[events2]})

df_overlap = None
if return_overlap:
Expand All @@ -475,58 +484,44 @@ def overlap(
overlap_col_ek1 = overlap_col + "_" + ek1

overlap_start = np.maximum(
df1[sk1].values[overlap_df_idxs[:, 0]],
df2[sk2].values[overlap_df_idxs[:, 1]],
df1[sk1].values[events1],
df2[sk2].values[events2],
)

overlap_end = np.minimum(
df1[ek1].values[overlap_df_idxs[:, 0]],
df2[ek2].values[overlap_df_idxs[:, 1]],
df1[ek1].values[events1],
df2[ek2].values[events2],
)

df_overlap = pd.DataFrame(
{overlap_col_sk1: overlap_start, overlap_col_ek1: overlap_end}
{
overlap_col_sk1: overlap_start,
overlap_col_ek1: overlap_end
}
)

df_input_1 = None
df_input_2 = None
if return_input or str(return_input) == "1" or return_input == "left":
df_input_1 = df1.iloc[overlap_df_idxs[:, 0]].reset_index(drop=True)
df_input_1 = df1.iloc[events1].reset_index(drop=True)
df_input_1.columns = [c + suffixes[0] for c in df_input_1.columns]
if return_input or str(return_input) == "2" or return_input == "right":
df_input_2 = df2.iloc[overlap_df_idxs[:, 1]].reset_index(drop=True)
df_input_2 = df2.iloc[events2].reset_index(drop=True)
df_input_2.columns = [c + suffixes[1] for c in df_input_2.columns]

# Masking non-overlapping regions if using non-inner joins.
if how != "inner":
df_index_1[overlap_df_idxs[:, 0] == -1] = None
df_index_1 = df_index_1.astype({index_col_1: pd.Int64Dtype()})
df_index_2[overlap_df_idxs[:, 1] == -1] = None
df_index_2 = df_index_2.astype({index_col_2: pd.Int64Dtype()})
df_index_1[events1 == -1] = None
df_index_2[events2 == -1] = None

if df_input_1 is not None:
df_input_1[overlap_df_idxs[:, 0] == -1] = None
df_input_1 = df_input_1.astype(
{
(sk1 + suffixes[0]): pd.Int64Dtype(),
(ek1 + suffixes[0]): pd.Int64Dtype(),
}
)
df_input_1[events1 == -1] = None

if df_input_2 is not None:
df_input_2[overlap_df_idxs[:, 1] == -1] = None
df_input_2 = df_input_2.astype(
{
(sk2 + suffixes[1]): pd.Int64Dtype(),
(ek2 + suffixes[1]): pd.Int64Dtype(),
}
)
df_input_2[events2 == -1] = None

if df_overlap is not None:
df_overlap[
(overlap_df_idxs[:, 0] == -1) | (overlap_df_idxs[:, 1] == -1)
] = None
df_overlap = df_overlap.astype(
{overlap_col_sk1: pd.Int64Dtype(), (overlap_col_ek1): pd.Int64Dtype()}
)
df_overlap[(events1 == -1) | (events2 == -1)] = None

out_df = pd.concat(
[df_index_1, df_input_1, df_index_2, df_input_2, df_overlap], axis="columns"
Expand Down
65 changes: 54 additions & 11 deletions tests/test_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ def test_trim():
["chrX_0", 1, 5],
],
columns=["chrom", "startFunky", "end"],
).astype({"startFunky": pd.Int64Dtype(), "end": pd.Int64Dtype()})
)
pd.testing.assert_frame_equal(
df_trimmed,
bioframe.trim(
Expand Down Expand Up @@ -522,6 +522,58 @@ def test_overlap():
)


def test_overlap_preserves_coord_dtypes():
df1 = pd.DataFrame(
[
["chr1", 8, 12, "+"],
["chr1", 7, 10, "-"],
["chrX", 1, 8, "+"],
],
columns=["chrom", "start", "end", "strand"],
).astype({"start": np.uint32, "end": np.uint32})
df2 = pd.DataFrame(
[
["chr1", 6, 10, "+"],
[pd.NA, pd.NA, pd.NA, "-"],
["chrX", 7, 10, "-"]
],
columns=["chrom", "start", "end", "strand"],
).astype({"start": pd.Int64Dtype(), "end": pd.Int64Dtype()})

# inner join - left keeps non-nullable numpy uint32
overlap_dtypes = bioframe.overlap(df1, df2, how="inner").dtypes
assert (df1.dtypes == overlap_dtypes[:4]).all()
assert (df2.dtypes == overlap_dtypes[4:].rename(lambda x: x.replace("_", ""))).all()

# outer join - left uint32 gets cast to numpy float64
overlap_dtypes = bioframe.overlap(df1, df2, how="outer").dtypes
assert overlap_dtypes["start"] == np.float64
assert overlap_dtypes["end"] == np.float64
assert overlap_dtypes["start_"] == pd.Int64Dtype()
assert overlap_dtypes["end_"] == pd.Int64Dtype()

# convert left coords to nullable *before* joining
overlap_dtypes = bioframe.overlap(df1.convert_dtypes(), df2, how="inner").dtypes
assert overlap_dtypes["start"] == pd.UInt32Dtype()
assert overlap_dtypes["end"] == pd.UInt32Dtype()
assert overlap_dtypes["start_"] == pd.Int64Dtype()
assert overlap_dtypes["end_"] == pd.Int64Dtype()

# convert coords to nullable *after* joining
# inner join - uint32 output becomes UInt32
# outer join - float64 output becomes Int64
overlap_dtypes = bioframe.overlap(df1, df2, how="inner").convert_dtypes().dtypes
assert overlap_dtypes["start"] == pd.UInt32Dtype()
assert overlap_dtypes["end"] == pd.UInt32Dtype()
assert overlap_dtypes["start_"] == pd.Int64Dtype()
assert overlap_dtypes["end_"] == pd.Int64Dtype()
overlap_dtypes = bioframe.overlap(df1, df2, how="outer").convert_dtypes().dtypes
assert overlap_dtypes["start"] == pd.Int64Dtype()
assert overlap_dtypes["end"] == pd.Int64Dtype()
assert overlap_dtypes["start_"] == pd.Int64Dtype()
assert overlap_dtypes["end_"] == pd.Int64Dtype()


def test_cluster():
df1 = pd.DataFrame(
[
Expand Down Expand Up @@ -1575,9 +1627,6 @@ def test_assign_view():
],
columns=["chrom", "start", "end", "strand", "view_region"],
)
df_assigned = df_assigned.astype(
{"chrom": str, "start": pd.Int64Dtype(), "end": pd.Int64Dtype()}
)
pd.testing.assert_frame_equal(df_assigned, bioframe.assign_view(df, view_df))

# non-default columns in view
Expand Down Expand Up @@ -1620,9 +1669,6 @@ def test_assign_view():
],
columns=["chrom", "start", "end", "strand", "funny_view_region"],
)
df_assigned = df_assigned.astype(
{"chrom": str, "start": pd.Int64Dtype(), "end": pd.Int64Dtype()}
)

pd.testing.assert_frame_equal(
df_assigned,
Expand All @@ -1641,13 +1687,10 @@ def test_assign_view():
["chr1", 0, 10, "+", "apples"],
["chrX", 5, 10, "+", "oranges"],
["chrX", 0, 5, "+", "oranges"],
["chr2", 5, 10, "+", pd.NA],
["chr2", 5, 10, "+", None],
],
columns=["chrom", "start", "end", "strand", "funny_view_region"],
)
df_assigned = df_assigned.astype(
{"chrom": str, "start": pd.Int64Dtype(), "end": pd.Int64Dtype()}
)

pd.testing.assert_frame_equal(
df_assigned,
Expand Down