From 06a85be56b4654c4a8a246d649a11f771bd46904 Mon Sep 17 00:00:00 2001 From: Maren Buettner Date: Tue, 22 Aug 2023 11:22:25 -0700 Subject: [PATCH] :fire: add chromosome scaffold filtering --- pycisTopic/pseudobulk_peak_calling.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pycisTopic/pseudobulk_peak_calling.py b/pycisTopic/pseudobulk_peak_calling.py index 7d6fdd1..6289b16 100644 --- a/pycisTopic/pseudobulk_peak_calling.py +++ b/pycisTopic/pseudobulk_peak_calling.py @@ -23,6 +23,7 @@ def export_pseudobulk( bed_path: str, bigwig_path: str, path_to_fragments: Optional[Dict[str, str]] = None, + chrom_filter: Optional[str] = None, sample_id_col: Optional[str] = "sample_id", n_cpu: Optional[int] = 1, normalize_bigwig: Optional[bool] = True, @@ -55,6 +56,9 @@ def export_pseudobulk( A dictionary of character strings, with sample name as names indicating the path to the fragments file/s from which pseudobulk profiles have to be created. If a :class:`CistopicObject` is provided as input it will be ignored, but if a cell metadata :class:`pd.DataFrame` is provided it is necessary to provide it. The keys of the dictionary need to match with the sample_id tag added to the index names of the input data frame. + chrom_filter: str, optional + A regular expression searched for in each fragment's chromosome name; fragments whose chromosome name matches (e.g. scaffolds such as GL/KI contigs) are dropped from the fragments list. Default: None (no filtering). + Example: `"GL|KI"` sample_id_col: str, optional Name of the column containing the sample name per barcode in the input :class:`CistopicObject.cell_data` or class:`pd.DataFrame`. Default: 'sample_id'. 
n_cpu: int, optional @@ -129,6 +133,12 @@ def export_pseudobulk( prepare_tag_cells(cell_data.index.tolist(), split_pattern) ) ] + if chrom_filter is not None: + fragment_drop = fragments_df.Chromosome.str.contains(chrom_filter) + n_fragments_dropped = fragment_drop.sum() + log.info("Filtering out " + str(n_fragments_dropped) + " fragments.") + fragments_df.drop(fragments_df[fragment_drop].index, inplace=True) + fragments_df_dict[sample_id] = fragments_df # Set groups