From e2b5bb7447bb7f6aa96420c42b88df3344ebaf4f Mon Sep 17 00:00:00 2001 From: sichao Date: Wed, 7 Feb 2024 15:54:49 -0500 Subject: [PATCH 1/3] update docstr and typing for external --- dynamo/external/gseapy.py | 28 ++---- dynamo/external/hodge.py | 66 ++++++-------- dynamo/external/scifate.py | 168 +++++++++++++++++++++++++---------- dynamo/external/scribe.py | 173 ++++++++++++++++--------------------- dynamo/external/utils.py | 36 +++++++- 5 files changed, 267 insertions(+), 204 deletions(-) diff --git a/dynamo/external/gseapy.py b/dynamo/external/gseapy.py index 36f7a4ba3..bdd1d1902 100644 --- a/dynamo/external/gseapy.py +++ b/dynamo/external/gseapy.py @@ -17,27 +17,17 @@ def enrichr( ): """Perform gene list enrichment with gseapy. - Parameters - ---------- - genes: - Flat file with list of genes, one gene id per row, or a python list object. - organism: - Enrichr supported organism. Select from (human, mouse, yeast, fly, fish, worm). + Args: + genes: Flat file with list of genes, one gene id per row, or a python list object. + organism: Enrichr supported organism. Select from (human, mouse, yeast, fly, fish, worm). see here for details: https://amp.pharm.mssm.edu/modEnrichr - :param gene_sets: - gene_sets: - str, list, tuple of Enrichr Library name(s). - description: - name of analysis. optional. - outdir: - Output file directory - cutoff: - Show enriched terms which Adjusted P-value < cutoff. Only affects the output figure. Default: 0.05 - kwargs: - additional arguments passed to the `gp.enrichr` function. + gene_sets: str, list, tuple of Enrichr Library name(s). + description: Name of analysis. optional. + outdir: Output file directory + cutoff: Show enriched terms which Adjusted P-value < cutoff. Only affects the output figure. Default: 0.05 + kwargs: Additional arguments passed to the `gp.enrichr` function. - Returns - ------- + Returns: An Enrichr object, which obj.res2d stores your last query, obj.results stores your all queries. >>> import dynamo as dyn diff --git a/dynamo/external/hodge.py b/dynamo/external/hodge.py index ac75d44c9..778094b80 100644 --- a/dynamo/external/hodge.py +++ b/dynamo/external/hodge.py @@ -45,50 +45,36 @@ def ddhodge( reconstructed vector field function. This method is relevant to the curl-free/divergence-free vector field reconstruction. - Parameters - ---------- - adata: :class:`~anndata.AnnData` - an Annodata object. - X_data: - The user supplied expression (embedding) data that will be used for graph hodege decomposition directly. - layer: - Which layer of the data will be used for graph Hodge decomposition. - basis: - Which basis of the data will be used for graph Hodge decomposition. - n: - Number of nearest neighbors when the nearest neighbor graph is not included. - VecFld: - The reconstructed vector field function. - adjmethod: - The method to build the ajacency matrix that will be used to create the sparse diffusion graph, can be + Args: + adata: An Anndata object. + X_data: The user supplied expression (embedding) data that will be used for graph hodege decomposition directly. + layer: Which layer of the data will be used for graph Hodge decomposition. + basis: Which basis of the data will be used for graph Hodge decomposition. + n: Number of nearest neighbors when the nearest neighbor graph is not included. + VecFld: The reconstructed vector field function. + adjmethod: The method to build the ajacency matrix that will be used to create the sparse diffusion graph, can be either "naive" or "graphize_vecfld". If "naive" used, the transition_matrix that created during vector field projection will be used; if "graphize_vecfld" used, a method that guarantees the preservance of divergence will be used. - n_downsamples: - Number of cells to downsample to if the cell number is large than this value. Three downsampling methods are - available, see `sampling_method`. - up_sampling: - Whether to assign calculated potential, curl and divergence to cells not sampled based on values from their - nearest sampled cells. - sampling_method: - Methods to downsample datasets to facilitate calculation. Can be one of {`random`, `velocity`, `trn`}, each - corresponds to random sampling, velocity magnitude based and topology representing network based sampling. - seed: - Seed for RandomState. Must be convertible to 32 bit unsigned integers. Used in sampling control points. + n_downsamples: Number of cells to downsample to if the cell number is large than this value. Three downsampling + methods are available, see `sampling_method`. + up_sampling: Whether to assign calculated potential, curl and divergence to cells not sampled based on values + from their nearest sampled cells. + sampling_method: Methods to downsample datasets to facilitate calculation. Can be one of {`random`, `velocity`, + `trn`}, each corresponds to random sampling, velocity magnitude based and topology representing network + based sampling. + seed: Seed for RandomState. Must be convertible to 32 bit unsigned integers. Used in sampling control points. Default is to be 0 for ensure consistency between different runs. - enforce: - Whether to enforce the calculation of adjacency matrix for estimating potential, curl, divergence for each - cell. - cores: - Number of cores to run the graphize_vecfld function. If cores is set to be > 1, multiprocessing will be used - to parallel the graphize_vecfld calculation. - - Returns - ------- - adata: :class:`~anndata.AnnData` - `AnnData` object that is updated with the `ddhodge` key in the `obsp` attribute which to adjacency matrix - that corresponds to the sparse diffusion graph. Two columns `potential` and `divergence` corresponds to the - potential and divergence for each cell will also be added.""" + enforce: Whether to enforce the calculation of adjacency matrix for estimating potential, curl, divergence for + each cell. + cores: Number of cores to run the graphize_vecfld function. If cores is set to be > 1, multiprocessing will be + used to parallel the graphize_vecfld calculation. + + Returns: + `AnnData` object that is updated with the `ddhodge` key in the `obsp` attribute which to adjacency matrix + that corresponds to the sparse diffusion graph. Two columns `potential` and `divergence` corresponds to the + potential and divergence for each cell will also be added. + """ main_log_time() prefix = "" if basis is None else basis + "_" diff --git a/dynamo/external/scifate.py b/dynamo/external/scifate.py index 5f15db3c3..98aad31e0 100644 --- a/dynamo/external/scifate.py +++ b/dynamo/external/scifate.py @@ -2,7 +2,7 @@ # the following code is based on Cao, et. al, Nature Biotechnology, 2020 and # https://github.com/JunyueC/sci-fate_analysis -from typing import Union +from typing import Dict import numpy as np import pandas as pd @@ -25,7 +25,9 @@ def scifate_glmnet( TF_link_ENCODE_ref: str = "https://www.dropbox.com/s/bjuope41pte7mf4/df_gene_TF_link_ENCODE.csv?dl=1", nt_layers: list = ["X_new", "X_total"], ) -> AnnData: - """Reconstruction of regulatory network (Cao, et. al, Nature Biotechnology, 2020) from TFs to other target + """Perform scifate analysis using glmnet. + + Reconstruction of regulatory network (Cao, et. al, Nature Biotechnology, 2020) from TFs to other target genes via LASSO regression between the total expression of known transcription factors and the newly synthesized RNA of potential targets. The inferred regulatory relationships between TF and targets are further filtered based on evidence of promoter motifs (not implemented currently) and the ENCODE chip-seq peaks. The python wrapper for the @@ -33,51 +35,43 @@ def scifate_glmnet( with glm-python can be found here (https://github.com/bbalasub1/glmnet_python/blob/master/test/glmnet_examples.ipynb). Note that this function can be applied to both of the metabolic labeling single-cell assays with newly synthesized and total RNA as well as the regular single cell assays with both the unspliced and spliced transcripts. Furthermore, - you can also replace the either the new or unspliced RNA with dynamo estimated cell-wise velocity, transcription, + you can also replace either the new or unspliced RNA with dynamo estimated cell-wise velocity, transcription, splicing and degradation rates for each gene (similarly, replacing the expression values of transcription factors with RNA binding, ribosome, epigenetics or epitranscriptomic factors, etc.) to infer the tottal regulatory effects, transcription, splicing and post-transcriptional regulation of different factors. In addition, this approach will be fully integrated with Scribe (Qiu, et. al, 2020) which employs restricted directed information to determine causality by estimating the - strength of information transferred from a potential regulator to its downstream target. In contrast of lasso regression, + strength of information transferred from a potential regulator to its downstream target. In contrast, of lasso regression, Scribe can learn both linear and non-linear causality in deterministic and stochastic systems. It also incorporates rigorous procedures to alleviate sampling bias and builds upon novel estimators and regularization techniques to facilitate inference of large-scale causal networks. - Parameters - ---------- - adata: :class:`~anndata.AnnData`. - adata object that includes both newly synthesized and total gene expression of cells. Alternatively, + Args: + adata: Adata object that includes both newly synthesized and total gene expression of cells. Alternatively, the object should include both unspliced and spliced gene expression of cells. - gene_filter_rate: - minimum percentage of expressed cells for gene filtering. - cell_filter_UMI: - minimum number of UMIs for cell filtering. - core_n_lasso: - number of cores for lasso regression in linkage analysis. By default, it is 1 and parallel is turned off. - Parallel computing can significantly speed up the computation process, especially for datasets involve - many cells or genes. But for smaller datasets or genes, it could result in a reduction in speed due to the - additional overhead. User discretion is advised. - core_n_filtering: - number of cores for filtering TF-gene links. Not used currently. - motif_ref: - The path to the TF binding motif data as described above. It provides the list of TFs gene names and + gene_filter_rate: Minimum percentage of expressed cells for gene filtering. + cell_filter_UMI: Minimum number of UMIs for cell filtering. + core_n_lasso: Number of cores for lasso regression in linkage analysis. By default, it is 1 and parallel is + turned off. Parallel computing can significantly speed up the computation process, especially for datasets + involve many cells or genes. But for smaller datasets or genes, it could result in a reduction in speed due + to the additional overhead. User discretion is advised. + core_n_filtering: Number of cores for filtering TF-gene links. Not used currently. + motif_ref: The path to the TF binding motif data as described above. It provides the list of TFs gene names and is used to process adata object to generate the TF expression and target new expression matrix for glmnet based TF-target synthesis rate linkage analysis. But currently it is not used for motif based filtering. - By default it is a dropbox link that store the data from us. Other motif reference can bed downloaded from RcisTarget: - https://resources.aertslab.org/cistarget/. For human motif matrix, it can be downloaded from June's shared folder: - https://shendure-web.gs.washington.edu/content/members/cao1025/public/nobackup/sci_fate/data/hg19-tss-centered-10kb-7species.mc9nr.feather - TF_link_ENCODE_ref: - The path to the TF chip-seq data. By default it is a dropbox link from us that stores the data. Other data can - be downloaded from: https://amp.pharm.mssm.edu/Harmonizome/dataset/ENCODE+Transcription+Factor+Targets. - nt_layers: - The layers that will be used for the network inference. Note that the layers can be changed flexibly. See - the description of this function above. + By default, it is a dropbox link that store the data from us. Other motif reference can bed downloaded from + RcisTarget: https://resources.aertslab.org/cistarget/. For human motif matrix, it can be downloaded from + June's shared folder: + https://shendure-web.gs.washington.edu/content/members/cao1025/public/nobackup/sci_fate/data/hg19-tss-centered-10kb-7species.mc9nr.feather + TF_link_ENCODE_ref: The path to the TF chip-seq data. By default, it is a dropbox link from us that stores the + data. Other data can be downloaded from: + https://amp.pharm.mssm.edu/Harmonizome/dataset/ENCODE+Transcription+Factor+Targets. + nt_layers: The layers that will be used for the network inference. Note that the layers can be changed flexibly. + See the description of this function above. Note that if your internet connection is slow, we recommend to download the `motif_ref` and `TF_link_ENCODE_ref` and supplies those two arguments with the local paths where the downloaded datasets are saved. - Returns - ------- + Returns: An updated adata object with a new key `scifate` in .uns attribute, which stores the raw lasso regression results and the filter results after applying the Fisher exact test of the ChIP-seq peaks. """ @@ -120,7 +114,7 @@ def scifate_glmnet( # filtering the links using TF-gene binding data and store the result in the target folder # note that currently the motif filtering is not implement - df_gene_TF_link_chip = TF_gene_filter_links(link_result, var, core_n_filtering, motif_ref, df_gene_TF_link_ENCODE) + df_gene_TF_link_chip = TF_gene_filter_links(link_result, var, df_gene_TF_link_ENCODE) adata.uns["scifate"] = { "glmnet_res": link_result, @@ -137,7 +131,25 @@ def adata_processing_TF_link( gene_filter_rate: float = 0.1, cell_filter_UMI: int = 10000, ) -> tuple: - """preprocess adata and get ready for TF-target gene analysis""" + """Preprocess adata and get ready for TF-target gene analysis. + + Args: + adata: Adata object that includes both newly synthesized and total gene expression of cells. Alternatively, + the object should include both unspliced and spliced gene expression of cells. + nt_layers: The layers that will be used for the network inference. Note that the layers can be changed flexibly. + TF_list: The list of transcription factors. + gene_filter_rate: Minimum percentage of expressed cells for gene filtering. + cell_filter_UMI: Minimum number of UMIs for cell filtering. + + Returns: + A tuple of the following: + - tot_mat: The total gene expression matrix. + - new_mat: The newly synthesized gene expression matrix. + - obs: The observation dataframe. + - var: The variable dataframe. + - var_TF: The variable matrix for transcription factors. + - TF_matrix: The matrix of transcription factors. + """ n_obs, n_var = adata.n_obs, adata.n_vars @@ -217,7 +229,22 @@ def link_TF_gene_analysis( cor_thresh: float = 0.03, seed: int = 123456, ) -> list: - """Perform lasso regression for each gene.""" + """Perform lasso regression for each gene. + + Args: + TF_matrix: The dataframe of transcription factors. + gene_matrix: The dataframe of genes. + var_TF: The variable dataframe for transcription factors. + core_num: Number of cores for lasso regression in linkage analysis. By default, it is 1 and parallel is + turned off. Parallel computing can significantly speed up the computation process, especially for datasets + involve many cells or genes. But for smaller datasets or genes, it could result in a reduction in speed due + to the additional overhead. User discretion is advised. + cor_thresh: The threshold for the correlation. + seed: The random seed. + + Returns: + A list of the results of the lasso regression for each gene. + """ gene_list = gene_matrix.index link_result = [None] * len(gene_list) @@ -246,12 +273,27 @@ def link_TF_gene_analysis( def TF_gene_link( TF_matrix: pd.DataFrame, linked_gene: str, - linked_gene_expr_vector, + linked_gene_expr_vector: pd.Series, core_num: int = 1, cor_thresh: float = 0.03, seed: int = 123456, ): - """Estimate the regulatory weight of each TF to its potential targets via lasso regression for each gene.""" + """Estimate the regulatory weight of each TF to its potential targets via lasso regression for each gene. + + Args: + TF_matrix: The dataframe of transcription factors. + linked_gene: The name of the linked gene. + linked_gene_expr_vector: The expression vector of the linked gene. + core_num: Number of cores for lasso regression in linkage analysis. By default, it is 1 and parallel is + turned off. Parallel computing can significantly speed up the computation process, especially for datasets + involve many cells or genes. But for smaller datasets or genes, it could result in a reduction in speed due + to the additional overhead. User discretion is advised. + cor_thresh: The threshold for the correlation. + seed: The random seed. + + Returns: + The result of the lasso regression for the gene. + """ expr_cor = einsum_correlation(TF_matrix.values, linked_gene_expr_vector.values)[0] select_sites = abs(expr_cor) > cor_thresh @@ -266,8 +308,25 @@ def TF_gene_link( return "unknown" -def lasso_regression_expression(x1, linked_gene, y, seed: int, parallel=1): - """Lasso linear model with iterative fitting along a regularization path. Select best model is by cross-validation.""" +def lasso_regression_expression( + x1: pd.DataFrame, + linked_gene: str, + y: pd.Series, + seed: int, + parallel: int = 1, +) -> pd.DataFrame: + """Lasso linear model with iterative fitting along a regularization path. Select best model is by cross-validation. + + Args: + x1: The dataframe of the input matrix. + linked_gene: The name of the linked gene. + y: The expression vector of the linked gene. + seed: The random seed. + parallel: The number of cores for parallel computing. + + Returns: + The result of the lasso regression. + """ x1 = x1.loc[:, ~x1.columns.duplicated(keep="first")] @@ -304,9 +363,17 @@ def lasso_regression_expression(x1, linked_gene, y, seed: int, parallel=1): return df_cor -def r2_glmnet(cv_out, y): - """calculate r2 using the lambda_1se. This value is for the most regularized model whose mean squared error is - within one standard error of the minimal.""" +def r2_glmnet(cv_out: Dict, y: pd.Series) -> float: + """Calculate r2 using the lambda_1se. This value is for the most regularized model whose mean squared error is + within one standard error of the minimal. + + Args: + cv_out: The output of the cross-validation. + y: The expression vector of the linked gene. + + Returns: + The r2 value. + """ # https://stackoverflow.com/questions/50610895/how-to-calculate-r-squared-value-for-lasso-regression-using-glmnet-in-r bestlam = cv_out["lambda_1se"] @@ -319,8 +386,21 @@ def r2_glmnet(cv_out, y): return r2[0] -def TF_gene_filter_links(raw_glmnet_res: pd.DataFrame, var, core_n, motif_ref, df_gene_TF_link_ENCODE): - """prepare data for TF-target gene link filtering""" +def TF_gene_filter_links( + raw_glmnet_res: pd.DataFrame, + var: pd.DataFrame, + df_gene_TF_link_ENCODE: pd.DataFrame, +) -> pd.DataFrame: + """Prepare data for TF-target gene link filtering. + + Args: + raw_glmnet_res: The raw result of glmnet. + var: The variable dataframe. + df_gene_TF_link_ENCODE: The dataframe of the TF chip-seq data. + + Returns: + The filtered result of the lasso regression. + """ var_ori = var.copy() if "gene_type" not in var.columns: diff --git a/dynamo/external/scribe.py b/dynamo/external/scribe.py index b9e5e4b13..8a3f2a0d2 100644 --- a/dynamo/external/scribe.py +++ b/dynamo/external/scribe.py @@ -1,6 +1,6 @@ import itertools from multiprocessing.dummy import Pool as ThreadPool -from typing import Union +from typing import List, Optional, Union import numpy as np import pandas as pd @@ -27,59 +27,48 @@ def scribe( TF_link_ENCODE_ref: str = "https://www.dropbox.com/s/bjuope41pte7mf4/df_gene_TF_link_ENCODE.csv?dl=1", ) -> AnnData: """Apply Scribe to calculate causal network from spliced/unspliced, metabolic labeling based and other "real" time - series datasets. Note that this function can be applied to both of the metabolic labeling based single-cell assays + series datasets. + + Note that this function can be applied to both of the metabolic labeling based single-cell assays with newly synthesized and total RNA as well as the regular single cell assays with both the unspliced and spliced - transcripts. Furthermore, you can also replace the either the new or unspliced RNA with dynamo estimated cell-wise + transcripts. Furthermore, you can also replace either the new or unspliced RNA with dynamo estimated cell-wise velocity, transcription, splicing and degradation rates for each gene (similarly, replacing the expression values of transcription factors with RNA binding, ribosome, epigenetics or epitranscriptomic factors, etc.) to infer the total regulatory effects, transcription, splicing and post-transcriptional regulation of different factors. - - Parameters - ---------- - adata: :class:`~anndata.AnnData`. - adata object that includes both newly synthesized and total gene expression of cells. Alternatively, + Args: + adata: Adata object that includes both newly synthesized and total gene expression of cells. Alternatively, the object should include both unspliced and spliced gene expression of cells. - genes: - The list of gene names that will be used for casual network inference. By default, it is `None` and thus + genes: The list of gene names that will be used for casual network inference. By default, it is `None` and thus will use all genes. - TFs: - The list of transcription factors that will be used for casual network inference. When it is `None` gene + TFs: The list of transcription factors that will be used for casual network inference. When it is `None` gene list included in the file linked by `motif_ref` will be used. - Targets: - The list of target genes that will be used for casual network inference. When it is `None` gene list not - included in the file linked by `motif_ref` will be used. - gene_filter_rate: - minimum percentage of expressed cells for gene filtering. - cell_filter_UMI: - minimum number of UMIs for cell filtering. - motif_ref: - It provides the list of TFs gene names and is used to parse the data to get the list of TFs and Targets - for the causal network inference from those TFs to Targets. But currently the motif based filtering is not - implemented. By default it is a dropbox link that store the data from us. Other motif reference can bed - downloaded from RcisTarget: https://resources.aertslab.org/cistarget/. For human motif matrix, it can be + Targets: The list of target genes that will be used for casual network inference. When it is `None` gene list + not included in the file linked by `motif_ref` will be used. + gene_filter_rate: Minimum percentage of expressed cells for gene filtering. + cell_filter_UMI: Minimum number of UMIs for cell filtering. + motif_ref: It provides the list of TFs gene names and is used to parse the data to get the list of TFs and + Targets for the causal network inference from those TFs to Targets. But currently the motif based filtering + is not implemented. By default, it is a dropbox link that store the data from us. Other motif reference can + bed downloaded from RcisTarget: https://resources.aertslab.org/cistarget/. For human motif matrix, it can be downloaded from June's shared folder: https://shendure-web.gs.washington.edu/content/members/cao1025/public/nobackup/sci_fate/data/hg19-tss- centered-10kb-7species.mc9nr.feather - nt_layers: - The two keys for layers that will be used for the network inference. Note that the layers can be changed - flexibly. See the description of this function above. The first key corresponds to the transcriptome of the - next time point, for example unspliced RNAs (or estimated velocitym, see Fig 6 of the Scribe preprint: + nt_layers: The two keys for layers that will be used for the network inference. Note that the layers can be + changed flexibly. See the description of this function above. The first key corresponds to the transcriptome + of the next time point, for example unspliced RNAs (or estimated velocity, see Fig 6 of the Scribe preprint: https://www.biorxiv.org/content/10.1101/426981v1) from RNA velocity, new RNA from scSLAM-seq data, etc. The second key corresponds to the transcriptome of the initial time point, for example spliced RNAs from RNA velocity, old RNA from scSLAM-seq data. - drop_zero_cells: - Whether to drop cells that with zero expression for either the potential regulator or potential target. This - can signify the relationship between potential regulators and targets, speed up the calculation, but at the - risk of ignoring strong inhibition effects from certain regulators to targets. - do_CLR: - Whether to perform context likelihood relatedness analysis on the reconstructed causal network - TF_link_ENCODE_ref: - The path to the TF chip-seq data. By default it is a dropbox link from us that stores the data. Other data - can be downloaded from: https://amp.pharm.mssm.edu/Harmonizome/dataset/ENCODE+Transcription+Factor+Targets. - - Returns - ------- + drop_zero_cells: Whether to drop cells that with zero expression for either the potential regulator or potential + target. This can signify the relationship between potential regulators and targets, speed up the calculation, + but at the risk of ignoring strong inhibition effects from certain regulators to targets. + do_CLR: Whether to perform context likelihood relatedness analysis on the reconstructed causal network + TF_link_ENCODE_ref: The path to the TF chip-seq data. By default, it is a dropbox link from us that stores the + data. Other data can be downloaded from: + https://amp.pharm.mssm.edu/Harmonizome/dataset/ENCODE+Transcription+Factor+Targets. + + Returns: An updated adata object with a new key `causal_net` in .uns attribute, which stores the inferred causal network. """ @@ -221,28 +210,27 @@ def scribe( return adata -def coexp_measure(adata, genes, layer_x, layer_y, cores=1, skip_mi=True): +def coexp_measure( + adata: AnnData, + genes: List, + layer_x: str, + layer_y: str, + cores: int = 1, + skip_mi: bool = True, +): """Calculate co-expression measures, including mutual information (MI), pearson correlation, etc. of genes between two different layers. - Parameters - ---------- - adata: :class:`~anndata.AnnData`. - adata object that will be used for mutual information calculation. - genes: `List` (default: None) - Gene names from the adata object that will be used for mutual information calculation. - layer_x: `str` - The first key of the layer from the adata object that will be used for mutual information calculation. - layer_y: `str` - The second key of the layer from the adata object that will be used for mutual information calculation. - cores: `int` (default: 1) - Number of cores to run the MI calculation. If cores is set to be > 1, multiprocessing will be used to + Args: + adata: Adata object that will be used for mutual information calculation. + genes: Gene names from the adata object that will be used for mutual information calculation. + layer_x: The first key of the layer from the adata object that will be used for mutual information calculation. + layer_y: The second key of the layer from the adata object that will be used for mutual information calculation. + cores: Number of cores to run the MI calculation. If cores is set to be > 1, multiprocessing will be used to parallel the calculation. `cores` is only applicable to MI calculation. - skip_mi: `bool` (default: `True`) - Whether to skip the mutual information calculation step which is time-consuming. + skip_mi: Whether to skip the mutual information calculation step which is time-consuming. - Returns - ------- + Returns: An updated adata object that updated with a new columns (`mi`, `pearson`) in .var contains the mutual information of input genes. """ @@ -300,20 +288,21 @@ def pool_mi(x, y, k): def coexp_measure_mat( - adata, - TFs=None, - Targets=None, - guide_keys=None, - t0_key="spliced", - t1_key="velocity", - normalize=True, - drop_zero_cells=True, - skip_mi=True, - cores=1, - copy=False, -): + adata: AnnData, + TFs: Optional[List] = None, + Targets: Optional[List] = None, + guide_keys: Optional[List] = None, + t0_key: str = "spliced", + t1_key: str = "velocity", + normalize: bool = True, + drop_zero_cells: bool = True, + skip_mi: bool = True, + cores: int = 1, + copy: bool = False, +) -> AnnData: """Infer causal networks with dynamics-coupled single cells measurements. - Network inference is a insanely challenging problem which has a long history and that none of the existing + + Network inference is an insanely challenging problem which has a long history and that none of the existing algorithms work well. However, it's quite possible that one or more of the algorithms could work if only they were given enough data. Single-cell RNA-seq is exciting because it provides a ton of data. Somewhat surprisingly, just having a lot of single-cell RNA-seq data won't make causal inference work well. We need a fundamentally better type @@ -328,33 +317,23 @@ def coexp_measure_mat( advance may be still not sufficient, radically different methods, for example something like highly multiplexed live imaging that can record many genes may be needed. - Arguments - --------- - adata: `anndata` - Annotated data matrix. - TFs: `List` or `None` (default: None) - The list of transcription factors that will be used for casual network inference. - Targets: `List` or `None` (default: None) - The list of target genes that will be used for casual network inference. - guide_keys: `List` (default: None) - The key of the CRISPR-guides, stored as a column in the .obs attribute. This argument is useful - for identifying the knockout or knockin genes for a perturb-seq experiment. Currently not used. - t0_key: `str` (default: spliced) - Key corresponds to the transcriptome of the initial time point, for example spliced RNAs from RNA velocity, old - RNA from scSLAM-seq data. - t1_key: `str` (default: velocity) - Key corresponds to the transcriptome of the next time point, for example unspliced RNAs (or estimated velocity, - see Fig 6 of the Scribe preprint) from RNA velocity, old RNA from scSLAM-seq data. - normalize: `bool` - Whether to scale the expression or velocity values into 0 to 1 before calculating causal networks. - drop_zero_cells: `bool` (Default: True) - Whether to drop cells that with zero expression for either the potential regulator or potential target. This - can signify the relationship between potential regulators and targets, speed up the calculation, but at the risk - of ignoring strong inhibition effects from certain regulators to targets. - copy: `bool` - Whether to return a copy of the adata or just update adata in place. - Returns - --------- + Args: + adata: Annotated data matrix. + TFs: The list of transcription factors that will be used for casual network inference. + Targets: The list of target genes that will be used for casual network inference. + guide_keys: The key of the CRISPR-guides, stored as a column in the .obs attribute. This argument is useful + for identifying the knockout or knockin genes for a perturb-seq experiment. Currently not used. + t0_key: Key corresponds to the transcriptome of the initial time point, for example spliced RNAs from RNA + velocity, old RNA from scSLAM-seq data. + t1_key: Key corresponds to the transcriptome of the next time point, for example unspliced RNAs (or estimated + velocity, see Fig 6 of the Scribe preprint) from RNA velocity, old RNA from scSLAM-seq data. + normalize: Whether to scale the expression or velocity values into 0 to 1 before calculating causal networks. + drop_zero_cells: Whether to drop cells that with zero expression for either the potential regulator or potential + target. This can signify the relationship between potential regulators and targets, speed up the + calculation, but at the risk of ignoring strong inhibition effects from certain regulators to targets. + copy: Whether to return a copy of the adata or just update adata in place. + + Returns: An update AnnData object with inferred causal network stored as a matrix related to the key `causal_net` in the `uns` slot. """ diff --git a/dynamo/external/utils.py b/dynamo/external/utils.py index cfb99be89..034e9db4b 100644 --- a/dynamo/external/utils.py +++ b/dynamo/external/utils.py @@ -6,8 +6,21 @@ from ..tools.utils import fdr -def normalize_data(mm, szfactors, pseudo_expr: float = 0.1): - """normalize data via size factor and scaling.""" +def normalize_data( + mm: np.ndarray, + szfactors: np.ndarray, + pseudo_expr: float = 0.1, +): + """Normalize data via size factor and scaling. + + Args: + mm: The input data matrix. + szfactors: The size factors for each cell. + pseudo_expr: The pseudo expression value to add before log transformation. + + Returns: + The normalized data matrix. + """ mm = mm / szfactors @@ -27,8 +40,23 @@ def normalize_data(mm, szfactors, pseudo_expr: float = 0.1): return mm -def TF_link_gene_chip(raw_glmnet_res_var, df_gene_TF_link_ENCODE, var, cor_thresh: float = 0.02): - """Filter the raw lasso regression links via chip-seq data based on a Fisher exact test""" +def TF_link_gene_chip( + raw_glmnet_res_var: pd.DataFrame, + df_gene_TF_link_ENCODE: pd.DataFrame, + var: pd.DataFrame, + cor_thresh: float = 0.02, +): + """Filter the raw lasso regression links via chip-seq data based on a Fisher exact test. + + Args: + raw_glmnet_res_var: The raw glmnet results. + df_gene_TF_link_ENCODE: The gene-TF link data from ENCODE. + var: The gene data. + cor_thresh: The correlation threshold. + + Returns: + The filtered gene-TF link data. + """ glmnet_res_var_filtered = raw_glmnet_res_var.query("abs(corcoef) > @cor_thresh") if glmnet_res_var_filtered.shape[0] < 1000: From 8553cd8bec806105f6211b1bb83b5c52b4af2944 Mon Sep 17 00:00:00 2001 From: sichao Date: Wed, 21 Feb 2024 17:03:52 -0500 Subject: [PATCH 2/3] add output data type hints --- dynamo/external/hodge.py | 4 ++-- dynamo/external/scifate.py | 4 ++-- dynamo/external/scribe.py | 4 ++-- dynamo/external/utils.py | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/dynamo/external/hodge.py b/dynamo/external/hodge.py index 778094b80..b30fa7ecf 100644 --- a/dynamo/external/hodge.py +++ b/dynamo/external/hodge.py @@ -40,7 +40,7 @@ def ddhodge( enforce: bool = False, cores: int = 1, **kwargs, -): +) -> None: """Modeling Latent Flow Structure using Hodge Decomposition based on the creation of sparse diffusion graph from the reconstructed vector field function. This method is relevant to the curl-free/divergence-free vector field reconstruction. @@ -71,7 +71,7 @@ def ddhodge( used to parallel the graphize_vecfld calculation. Returns: - `AnnData` object that is updated with the `ddhodge` key in the `obsp` attribute which to adjacency matrix + `AnnData` object wil be updated with the `ddhodge` key in the `obsp` attribute which to adjacency matrix that corresponds to the sparse diffusion graph. Two columns `potential` and `divergence` corresponds to the potential and divergence for each cell will also be added. """ diff --git a/dynamo/external/scifate.py b/dynamo/external/scifate.py index 98aad31e0..76eeaee61 100644 --- a/dynamo/external/scifate.py +++ b/dynamo/external/scifate.py @@ -2,7 +2,7 @@ # the following code is based on Cao, et. al, Nature Biotechnology, 2020 and # https://github.com/JunyueC/sci-fate_analysis -from typing import Dict +from typing import Dict, Union import numpy as np import pandas as pd @@ -277,7 +277,7 @@ def TF_gene_link( core_num: int = 1, cor_thresh: float = 0.03, seed: int = 123456, -): +) -> Union[pd.DataFrame, str]: """Estimate the regulatory weight of each TF to its potential targets via lasso regression for each gene. Args: diff --git a/dynamo/external/scribe.py b/dynamo/external/scribe.py index 8a3f2a0d2..587140d2f 100644 --- a/dynamo/external/scribe.py +++ b/dynamo/external/scribe.py @@ -217,7 +217,7 @@ def coexp_measure( layer_y: str, cores: int = 1, skip_mi: bool = True, -): +) -> None: """Calculate co-expression measures, including mutual information (MI), pearson correlation, etc. of genes between two different layers. @@ -231,7 +231,7 @@ def coexp_measure( skip_mi: Whether to skip the mutual information calculation step which is time-consuming. Returns: - An updated adata object that updated with a new columns (`mi`, `pearson`) in .var contains the mutual + None. The adata object will be updated with a new columns (`mi`, `pearson`) in .var contains the mutual information of input genes. """ diff --git a/dynamo/external/utils.py b/dynamo/external/utils.py index 034e9db4b..7f5f3a624 100644 --- a/dynamo/external/utils.py +++ b/dynamo/external/utils.py @@ -10,7 +10,7 @@ def normalize_data( mm: np.ndarray, szfactors: np.ndarray, pseudo_expr: float = 0.1, -): +) -> np.ndarray: """Normalize data via size factor and scaling. Args: @@ -45,7 +45,7 @@ def TF_link_gene_chip( df_gene_TF_link_ENCODE: pd.DataFrame, var: pd.DataFrame, cor_thresh: float = 0.02, -): +) -> pd.DataFrame: """Filter the raw lasso regression links via chip-seq data based on a Fisher exact test. Args: From 54c95c1bdd895ad77a68b36b767df04bd2658614 Mon Sep 17 00:00:00 2001 From: sichao Date: Wed, 21 Feb 2024 17:06:14 -0500 Subject: [PATCH 3/3] add missing dot --- dynamo/external/gseapy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dynamo/external/gseapy.py b/dynamo/external/gseapy.py index bdd1d1902..220fb6e1a 100644 --- a/dynamo/external/gseapy.py +++ b/dynamo/external/gseapy.py @@ -23,7 +23,7 @@ def enrichr( see here for details: https://amp.pharm.mssm.edu/modEnrichr gene_sets: str, list, tuple of Enrichr Library name(s). description: Name of analysis. optional. - outdir: Output file directory + outdir: Output file directory. cutoff: Show enriched terms which Adjusted P-value < cutoff. Only affects the output figure. Default: 0.05 kwargs: Additional arguments passed to the `gp.enrichr` function.