diff --git a/bioframe/ops.py b/bioframe/ops.py index bd2e1c6..6a59e2c 100644 --- a/bioframe/ops.py +++ b/bioframe/ops.py @@ -274,11 +274,18 @@ def _overlap_intidxs(df1, df2, how="left", cols1=None, cols2=None, on=None): if on is not None: group_list1 += on group_list2 += on - df1_groups = df1.groupby(group_list1, observed=True, dropna=False).indices + df1_groups = df1.groupby(group_list1, observed=True, dropna=False).indices df2_groups = df2.groupby(group_list2, observed=True, dropna=False).indices all_groups = set.union(set(df1_groups), set(df2_groups)) + # Extract coordinate columns + # .values will return a numpy array or pandas array, depending on the dtype + starts1 = df1[sk1].values + ends1 = df1[ek1].values + starts2 = df2[sk2].values + ends2 = df2[ek2].values + # Find overlapping intervals per group (determined by chrom and on). overlap_intidxs = [] for group_keys in all_groups: @@ -293,11 +300,12 @@ def _overlap_intidxs(df1, df2, how="left", cols1=None, cols2=None, on=None): both_groups_nonempty = (df1_group_idxs.size > 0) and (df2_group_idxs.size > 0) if both_groups_nonempty: + overlap_idxs_loc = arrops.overlap_intervals( - df1[sk1].values[df1_group_idxs], - df1[ek1].values[df1_group_idxs], - df2[sk2].values[df2_group_idxs], - df2[ek2].values[df2_group_idxs], + starts1[df1_group_idxs], + ends1[df1_group_idxs], + starts2[df2_group_idxs], + ends2[df2_group_idxs], ) # Convert local per-chromosome indices into the @@ -377,6 +385,7 @@ def overlap( cols1=None, cols2=None, on=None, + ensure_nullable=False, ): """ Find pairs of overlapping genomic intervals. @@ -428,17 +437,46 @@ def overlap( when considering overlaps. A common use would be passing on=['strand']. Default is None. + ensure_nullable : bool + If True, ensures that the output dataframe uses nullable Pandas + integer dtypes for start and end coordinates. This may involve + converting coordinate columns in the input dataframes. + Default False. + Returns ------- df_overlap : pandas.DataFrame + Notes + ----- + By default, the dtypes of the `start` and `end` coordinate columns + returned in the output dataframe are preserved from the input dataframes, + following native type casting rules if missing data are introduced. + + This means, for example, that if `df1` uses a NumPy integer dtype for + `start` and/or `end`, the output dataframe will use the same dtype after + an inner join, but, due to casting rules, may produce ``float64`` after a + left/right/outer join with missing data stored as ``NaN``. On the other + hand, if `df1` uses Pandas nullable dtypes, the corresponding coordinate + columns will preserve the same dtype in the output, with missing data + stored as ``NA``. If ``ensure_nullable`` is True, the output dataframe will + always return Pandas nullable dtypes for start and end coordinates. """ - ck1, sk1, ek1 = _get_default_colnames() if cols1 is None else cols1 ck2, sk2, ek2 = _get_default_colnames() if cols2 is None else cols2 checks.is_bedframe(df1, raise_errors=True, cols=[ck1, sk1, ek1]) checks.is_bedframe(df2, raise_errors=True, cols=[ck2, sk2, ek2]) + if ensure_nullable: + df1 = df1.assign(**{ + sk1: df1[sk1].convert_dtypes(), + ek1: df1[ek1].convert_dtypes(), + }) + df2 = df2.assign(**{ + sk2: df2[sk2].convert_dtypes(), + ek2: df2[ek2].convert_dtypes(), + }) + if (how == "left") and (keep_order is None): keep_order = True if (how != "left") and keep_order: @@ -460,13 +498,15 @@ def overlap( cols2=cols2, on=on, ) + events1 = overlap_df_idxs[:, 0] + events2 = overlap_df_idxs[:, 1] # Generate output tables. index_col = return_index if isinstance(return_index, str) else "index" index_col_1 = index_col + suffixes[0] index_col_2 = index_col + suffixes[1] - df_index_1 = pd.DataFrame({index_col_1: df1.index[overlap_df_idxs[:, 0]]}) - df_index_2 = pd.DataFrame({index_col_2: df2.index[overlap_df_idxs[:, 1]]}) + df_index_1 = pd.DataFrame({index_col_1: df1.index[events1]}) + df_index_2 = pd.DataFrame({index_col_2: df2.index[events2]}) df_overlap = None if return_overlap: @@ -475,58 +515,44 @@ def overlap( overlap_col_ek1 = overlap_col + "_" + ek1 overlap_start = np.maximum( - df1[sk1].values[overlap_df_idxs[:, 0]], - df2[sk2].values[overlap_df_idxs[:, 1]], + df1[sk1].values[events1], + df2[sk2].values[events2], ) overlap_end = np.minimum( - df1[ek1].values[overlap_df_idxs[:, 0]], - df2[ek2].values[overlap_df_idxs[:, 1]], + df1[ek1].values[events1], + df2[ek2].values[events2], ) df_overlap = pd.DataFrame( - {overlap_col_sk1: overlap_start, overlap_col_ek1: overlap_end} + { + overlap_col_sk1: overlap_start, + overlap_col_ek1: overlap_end + } ) df_input_1 = None df_input_2 = None if return_input or str(return_input) == "1" or return_input == "left": - df_input_1 = df1.iloc[overlap_df_idxs[:, 0]].reset_index(drop=True) + df_input_1 = df1.iloc[events1].reset_index(drop=True) df_input_1.columns = [c + suffixes[0] for c in df_input_1.columns] if return_input or str(return_input) == "2" or return_input == "right": - df_input_2 = df2.iloc[overlap_df_idxs[:, 1]].reset_index(drop=True) + df_input_2 = df2.iloc[events2].reset_index(drop=True) df_input_2.columns = [c + suffixes[1] for c in df_input_2.columns] # Masking non-overlapping regions if using non-inner joins. if how != "inner": - df_index_1[overlap_df_idxs[:, 0] == -1] = None - df_index_1 = df_index_1.astype({index_col_1: pd.Int64Dtype()}) - df_index_2[overlap_df_idxs[:, 1] == -1] = None - df_index_2 = df_index_2.astype({index_col_2: pd.Int64Dtype()}) + df_index_1[events1 == -1] = None + df_index_2[events2 == -1] = None if df_input_1 is not None: - df_input_1[overlap_df_idxs[:, 0] == -1] = None - df_input_1 = df_input_1.astype( - { - (sk1 + suffixes[0]): pd.Int64Dtype(), - (ek1 + suffixes[0]): pd.Int64Dtype(), - } - ) + df_input_1[events1 == -1] = None + if df_input_2 is not None: - df_input_2[overlap_df_idxs[:, 1] == -1] = None - df_input_2 = df_input_2.astype( - { - (sk2 + suffixes[1]): pd.Int64Dtype(), - (ek2 + suffixes[1]): pd.Int64Dtype(), - } - ) + df_input_2[events2 == -1] = None + if df_overlap is not None: - df_overlap[ - (overlap_df_idxs[:, 0] == -1) | (overlap_df_idxs[:, 1] == -1) - ] = None - df_overlap = df_overlap.astype( - {overlap_col_sk1: pd.Int64Dtype(), (overlap_col_ek1): pd.Int64Dtype()} - ) + df_overlap[(events1 == -1) | (events2 == -1)] = None out_df = pd.concat( [df_index_1, df_input_1, df_index_2, df_input_2, df_overlap], axis="columns" diff --git a/tests/test_ops.py b/tests/test_ops.py index 0a753ba..17d3d44 100644 --- a/tests/test_ops.py +++ b/tests/test_ops.py @@ -130,7 +130,7 @@ def test_trim(): ["chrX_0", 1, 5], ], columns=["chrom", "startFunky", "end"], - ).astype({"startFunky": pd.Int64Dtype(), "end": pd.Int64Dtype()}) + ) pd.testing.assert_frame_equal( df_trimmed, bioframe.trim( @@ -522,6 +522,98 @@ def test_overlap(): ) +def test_overlap_preserves_coord_dtypes(): + df1 = pd.DataFrame( + [ + ["chr1", 8, 12, "+"], + ["chr1", 7, 10, "-"], + ["chrX", 1, 8, "+"], + ], + columns=["chrom", "start", "end", "strand"], + ).astype({"start": np.uint32, "end": np.uint32}) + df2 = pd.DataFrame( + [ + ["chr1", 6, 10, "+"], + [pd.NA, pd.NA, pd.NA, "-"], + ["chrX", 7, 10, "-"] + ], + columns=["chrom", "start", "end", "strand"], + ).astype({"start": pd.Int64Dtype(), "end": pd.Int64Dtype()}) + + # inner join - left keeps non-nullable numpy uint32 + overlap_dtypes = bioframe.overlap(df1, df2, how="inner").dtypes + overlap_dtypes = bioframe.overlap(df1, df2, how="inner").dtypes + for col in ["start", "end"]: + assert overlap_dtypes[col] == np.uint32 + for col in ["start_", "end_"]: + assert overlap_dtypes[col] == pd.Int64Dtype() + + # outer join - left uint32 gets cast to numpy float64 + overlap_dtypes = bioframe.overlap(df1, df2, how="outer").dtypes + assert overlap_dtypes["start"] == np.float64 + assert overlap_dtypes["end"] == np.float64 + assert overlap_dtypes["start_"] == pd.Int64Dtype() + assert overlap_dtypes["end_"] == pd.Int64Dtype() + + # convert left coords to nullable *before* joining + overlap_dtypes = bioframe.overlap(df1.convert_dtypes(), df2, how="inner").dtypes + assert overlap_dtypes["start"] == pd.UInt32Dtype() + assert overlap_dtypes["end"] == pd.UInt32Dtype() + assert overlap_dtypes["start_"] == pd.Int64Dtype() + assert overlap_dtypes["end_"] == pd.Int64Dtype() + + # convert coords to nullable *after* joining + # inner join - uint32 output becomes UInt32 + # outer join - float64 output becomes Int64 + overlap_dtypes = bioframe.overlap(df1, df2, how="inner").convert_dtypes().dtypes + assert overlap_dtypes["start"] == pd.UInt32Dtype() + assert overlap_dtypes["end"] == pd.UInt32Dtype() + assert overlap_dtypes["start_"] == pd.Int64Dtype() + assert overlap_dtypes["end_"] == pd.Int64Dtype() + overlap_dtypes = bioframe.overlap(df1, df2, how="outer").convert_dtypes().dtypes + assert overlap_dtypes["start"] == pd.Int64Dtype() + assert overlap_dtypes["end"] == pd.Int64Dtype() + assert overlap_dtypes["start_"] == pd.Int64Dtype() + assert overlap_dtypes["end_"] == pd.Int64Dtype() + + +def test_overlap_ensure_nullable_coords(): + df1 = pd.DataFrame( + [ + ["chr1", 8, 12, "+"], + ["chr1", 7, 10, "-"], + ["chrX", 1, 8, "+"], + ], + columns=["chrom", "start", "end", "strand"], + ).astype({"start": np.uint32, "end": np.uint32}) + df2 = pd.DataFrame( + [ + ["chr1", 6, 10, "+"], + [pd.NA, pd.NA, pd.NA, "-"], + ["chrX", 7, 10, "-"] + ], + columns=["chrom", "start", "end", "strand"], + ).astype({"start": pd.Int64Dtype(), "end": pd.Int64Dtype()}) + + # inner join - left uint32 gets cast to UInt32 + overlap_dtypes = bioframe.overlap( + df1, df2, how="inner", ensure_nullable=True + ).dtypes + for col in ["start", "end"]: + assert overlap_dtypes[col] == pd.UInt32Dtype() + for col in ["start_", "end_"]: + assert overlap_dtypes[col] == pd.Int64Dtype() + + # outer join - left uint32 gets cast to UInt32 before the join + overlap_dtypes = bioframe.overlap( + df1, df2, how="outer", ensure_nullable=True + ).dtypes + for col in ["start", "end"]: + assert overlap_dtypes[col] == pd.UInt32Dtype() + for col in ["start_", "end_"]: + assert overlap_dtypes[col] == pd.Int64Dtype() + + def test_cluster(): df1 = pd.DataFrame( [ @@ -1575,9 +1667,6 @@ def test_assign_view(): ], columns=["chrom", "start", "end", "strand", "view_region"], ) - df_assigned = df_assigned.astype( - {"chrom": str, "start": pd.Int64Dtype(), "end": pd.Int64Dtype()} - ) pd.testing.assert_frame_equal(df_assigned, bioframe.assign_view(df, view_df)) # non-default columns in view @@ -1620,9 +1709,6 @@ def test_assign_view(): ], columns=["chrom", "start", "end", "strand", "funny_view_region"], ) - df_assigned = df_assigned.astype( - {"chrom": str, "start": pd.Int64Dtype(), "end": pd.Int64Dtype()} - ) pd.testing.assert_frame_equal( df_assigned, @@ -1641,13 +1727,10 @@ def test_assign_view(): ["chr1", 0, 10, "+", "apples"], ["chrX", 5, 10, "+", "oranges"], ["chrX", 0, 5, "+", "oranges"], - ["chr2", 5, 10, "+", pd.NA], + ["chr2", 5, 10, "+", None], ], columns=["chrom", "start", "end", "strand", "funny_view_region"], ) - df_assigned = df_assigned.astype( - {"chrom": str, "start": pd.Int64Dtype(), "end": pd.Int64Dtype()} - ) pd.testing.assert_frame_equal( df_assigned,