add human-mouse

singleron-RD · Sep 10, 2024 · 73fd3aa · 73fd3aa
1 parent ba4a59a
commit 73fd3aa
Show file tree

Hide file tree

Showing 6 changed files with 343 additions and 2 deletions.
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -0,0 +1,4 @@
+{
+    "python.analysis.typeCheckingMode": "basic",
+    "python.analysis.autoImportCompletions": true
+}
diff --git a/sccore/__init__.py b/sccore/__init__.py
@@ -1 +1 @@
-VERSION = "1.0.1"
+VERSION = "1.0.2"
diff --git a/sccore/cli/human_mouse.py b/sccore/cli/human_mouse.py
@@ -0,0 +1,88 @@
+"""
+estimate ambient from human_mouse sample
+"""
+
+from sccore.matrix import CountMatrix
+import pandas as pd
+import argparse
+import glob
+import os
+import functools
+
+
+def get_metrics_dict(mtx_path, doublet_threshold):
+    mtx = CountMatrix.from_matrix_dir(mtx_path)
+    m = mtx.get_matrix()
+    f = mtx.get_features()
+
+    human = []
+    mouse = []
+    for i, gn in enumerate(f.gene_name):
+        if gn == gn.upper():
+            human.append(i)
+        else:
+            mouse.append(i)
+    umi_h = m.tocsr()[human, :].sum(axis=0)
+    umi_m = m.tocsr()[mouse, :].sum(axis=0)
+    df_m = pd.DataFrame(umi_m.T, columns=["mouse"])
+    df_h = pd.DataFrame(umi_h.T, columns=["human"])
+    df = pd.merge(df_m, df_h, left_index=True, right_index=True)
+    df["umi_sum"] = df.sum(axis=1)
+    df["human_percent"] = df["human"] / df["umi_sum"] * 100
+    df["mouse_percent"] = df["mouse"] / df["umi_sum"] * 100
+    df["identity"] = "doublet"
+    df.loc[df["human_percent"] < doublet_threshold, "identity"] = "mouse"
+    df.loc[df["mouse_percent"] < doublet_threshold, "identity"] = "human"
+    df.loc[df["identity"] == "human", "ambient_percent"] = df[df["identity"] == "human"].mouse_percent
+    df.loc[df["identity"] == "mouse", "ambient_percent"] = df[df["identity"] == "mouse"].human_percent
+
+    n_cell = df.shape[0]
+    n_doublet = df[df["identity"] == "doublet"].shape[0]
+    doublet_percent = round(n_doublet / n_cell * 100, 2)
+    dict = {
+        "n_cell": n_cell,
+        "n_human_cell": df[df["identity"] == "human"].shape[0],
+        "n_mouse_cell": df[df["identity"] == "mouse"].shape[0],
+        "doublet_umi_percent_threshold": doublet_threshold,
+        "n_doublet": n_doublet,
+        "doublet_cell_percent": doublet_percent,
+        "median_ambient_umi_percent": round(df["ambient_percent"].median(), 2),
+        "mean_ambient_umi_percent": round(df["ambient_percent"].mean(), 2),
+    }
+
+    return dict
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-c", "--celescope_dir", type=str, required=True)
+    parser.add_argument("--doublet_threshold", type=float, default=25.0)
+    parser.add_argument("--out_prefix", type=str, default="human_mouse")
+    args = parser.parse_args()
+
+    cur_dir = os.getcwd()
+    dfs = []
+    for d in args.celescope_dir.split(","):
+        os.chdir(d)
+        paths = glob.glob("*/filtered")
+        for path in paths:
+            sample = path.split("/")[0]
+            dict = get_metrics_dict(path, args.doublet_threshold)
+            dfs.append(pd.DataFrame.from_dict(dict, orient="index", columns=[sample]))
+
+    df = functools.reduce(lambda left, right: pd.merge(left, right, left_index=True, right_index=True), dfs)
+    df = df.T
+    df = df.astype(
+        {
+            "n_cell": int,
+            "n_human_cell": int,
+            "n_mouse_cell": int,
+            "n_doublet": int,
+        }
+    )
+    os.chdir(cur_dir)
+    df.to_csv(f"{args.out_prefix}.tsv", sep="\t")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/sccore/matrix.py b/sccore/matrix.py
@@ -0,0 +1,248 @@
+import gzip
+import os
+from collections import defaultdict
+
+import scipy.io
+import scipy.sparse
+import pandas as pd
+from . import utils
+
+
+BARCODE_FILE_NAME = "barcodes.tsv.gz"
+FEATURE_FILE_NAME = "features.tsv.gz"
+MATRIX_FILE_NAME = "matrix.mtx.gz"
+
+
+ROW = "geneID"
+COLUMN = "Barcode"
+
+
+class Features:
+    def __init__(self, gene_id: list, gene_name=None, gene_type=None):
+        """
+        Args:
+            gene_id: list of gene id
+            gene_name: list of gene name
+            type: ype of features, e.g. [gene, protein]
+        """
+        self.gene_id = list(gene_id)
+        if not gene_name:
+            self.gene_name = self.gene_id
+        else:
+            self.gene_name = list(gene_name)
+        self.gene_type = gene_type
+
+    @classmethod
+    def from_tsv(cls, tsv_file):
+        if (not tsv_file) or (not os.path.exists(tsv_file)):
+            raise FileNotFoundError(
+                f"ERROR: {tsv_file} does not exist. \nSOLUTION: Rename and gzip the features file to {FEATURE_FILE_NAME}\n"
+            )
+        df = pd.read_csv(
+            tsv_file,
+            sep="\t",
+            on_bad_lines="skip",
+            names=["gene_id", "gene_name", "type"],
+            dtype=str,
+        )
+        gene_id = df["gene_id"].tolist()
+        gene_name = df["gene_name"].tolist()
+        # avoid adding extra \t to genes.tsv when all the gene_type are Nan
+        # if gene_type is None and add to dataframe, will cause Seurat::Read error: Error in FUN(X[[i]], ...) : # # subscript out of bounds
+        if df["type"].isnull().sum() == len(df["type"]):
+            gene_type = None
+        else:
+            gene_type = df["type"].tolist()
+        return cls(gene_id, gene_name, gene_type)
+
+    def to_tsv(self, tsv_file):
+        """
+        if gene_type is None and add to dataframe, will cause Seurat::Read10X error: Error in FUN(X[[i]], ...) : subscript out of bounds
+        """
+        if self.gene_type:
+            df = pd.DataFrame(
+                {
+                    "gene_id": self.gene_id,
+                    "gene_name": self.gene_name,
+                    "gene_type": self.gene_type,
+                }
+            )
+        else:
+            df = pd.DataFrame({"gene_id": self.gene_id, "gene_name": self.gene_name})
+        df.to_csv(tsv_file, sep="\t", index=False, header=False)
+
+
+def get_matrix_file_path(matrix_dir, file_name):
+    """
+    compatible with non-gzip file
+    """
+    non_gzip = file_name.strip(".gz")
+    file_path_list = [f"{matrix_dir}/{file_name}", f"{matrix_dir}/{non_gzip}"]
+    for file_path in file_path_list:
+        if os.path.exists(file_path):
+            return file_path
+
+
+class CountMatrix:
+    def __init__(self, features: Features, barcodes: list, matrix):
+        """
+        Args:
+            features: Features object
+            barcodes: list of barcodes
+            matrix: scipy.sparse.coo_matrix
+        """
+        self.__features = features
+        self.__barcodes = barcodes
+        self.__matrix = matrix
+        self.shape = matrix.shape
+
+    @staticmethod
+    @utils.add_log
+    def read_barcodes(matrix_dir):
+        """
+        returns: barcodes set
+        """
+        barcode_file = get_matrix_file_path(matrix_dir, BARCODE_FILE_NAME)
+        barcodes, _ = utils.read_one_col(barcode_file)
+        return set(barcodes)
+
+    @classmethod
+    @utils.add_log
+    def from_matrix_dir(cls, matrix_dir):
+        if not os.path.exists(matrix_dir):
+            raise FileNotFoundError(f"{matrix_dir} does not exist")
+        features_tsv = get_matrix_file_path(matrix_dir, FEATURE_FILE_NAME)
+        features = Features.from_tsv(tsv_file=features_tsv)
+        barcode_file = get_matrix_file_path(matrix_dir, BARCODE_FILE_NAME)
+        barcodes = utils.read_one_col(barcode_file)
+        matrix_path = get_matrix_file_path(matrix_dir, MATRIX_FILE_NAME)
+        matrix = scipy.io.mmread(matrix_path)
+
+        return cls(features, barcodes, matrix)
+
+    @utils.add_log
+    def to_matrix_dir(self, matrix_dir):
+        os.mkdir(matrix_dir)
+        self.__features.to_tsv(f"{matrix_dir}/{FEATURE_FILE_NAME}")
+        pd.Series(self.__barcodes).to_csv(f"{matrix_dir}/{BARCODE_FILE_NAME}", index=False, sep="\t", header=False)
+        matrix_path = f"{matrix_dir}/{MATRIX_FILE_NAME}"
+        with gzip.open(matrix_path, "wb") as f:
+            scipy.io.mmwrite(f, self.__matrix)
+
+    @classmethod
+    def from_dataframe(cls, df, features: Features, barcodes=None, value="UMI"):
+        """
+        Use all gene_id from features even if it is not in df
+        Args:
+            df: count details dataframe with columns UMI and multi-index (column, row).
+            features: Features
+            barcodes: only keep barcode in barcodes
+            value: value name in df, UMI
+
+        """
+        if not barcodes:
+            barcodes = df.index.levels[0].tolist()
+
+        feature_index_dict = {}
+        for index, gene_id in enumerate(features.gene_id):
+            feature_index_dict[gene_id] = index
+
+        barcode_index_dict = {}
+        for index, barcode in enumerate(barcodes):
+            barcode_index_dict[barcode] = index
+
+        # use all barcodes
+        barcode_codes = [barcode_index_dict[barcode] for barcode in df.index.get_level_values(level=0)]
+        # use all gene_id from features even if it is not in df
+        gene_id_codes = [feature_index_dict[gene_id] for gene_id in df.index.get_level_values(level=1)]
+        mtx = scipy.sparse.coo_matrix(
+            (df[value], (gene_id_codes, barcode_codes)),
+            shape=(len(features.gene_id), len(barcodes)),
+        )
+
+        return cls(features, barcodes, mtx)
+
+    def __str__(self):
+        n_row, n_col = self.shape[0], self.shape[1]
+        return f"CountMatrix object\n {n_row} x {n_col} coo_matrix"
+
+    def __repr__(self):
+        return self.__str__()
+
+    def concat_by_barcodes(self, other):
+        if other.get_barcodes() != self.get_barcodes():
+            raise ValueError("barcodes are not the same")
+
+        if inter := set(self.get_features().gene_id).intersection(set(other.get_features().gene_id)):
+            raise ValueError(f"deuplicated gene_id: {inter}")
+
+        gene_id = self.get_features().gene_id + other.get_features().gene_id
+        gene_name = self.get_features().gene_name + other.get_features().gene_name
+        gene_type = None
+        if self.get_features().gene_type and other.get_features().gene_type:
+            gene_type = self.get_features().gene_type + other.get_features().gene_type
+        features = Features(gene_id, gene_name, gene_type)
+
+        matrix = scipy.sparse.vstack([self.get_matrix(), other.get_matrix()])
+
+        return CountMatrix(features, self.__barcodes, matrix)
+
+    @utils.add_log
+    def slice_matrix(self, slice_barcodes_indices):
+        """
+        Args:
+            slice_barcodes_indices: list of barcode indices
+        Returns:
+            CountMatrix object
+        """
+        mtx_csc = self.__matrix.tocsc()
+        slice_barcodes_indices.sort()
+        sliced_mtx = mtx_csc[:, slice_barcodes_indices]
+        barcodes = [self.__barcodes[i] for i in slice_barcodes_indices]
+        return CountMatrix(self.__features, barcodes, sliced_mtx)
+
+    @utils.add_log
+    def slice_matrix_bc(self, bcs):
+        """
+        Args:
+            bcs: cell barcodes
+        Returns:
+            CountMatrix object
+        """
+        barcodes_indices = [self.__barcodes.index(barcode) for barcode in bcs]
+        barcodes_indices.sort()
+        return self.slice_matrix(barcodes_indices)
+
+    @utils.add_log
+    def get_genes_fraction(self, gene_list):
+        """
+        Returns:
+            numpy 2d fraction of gene_names in gene_list
+        """
+        gene_indices = [self.__features.gene_name.index(gene) for gene in gene_list]
+        mtx = self.__matrix.tocsc()
+        total = mtx.sum(axis=0)
+        gene = mtx[gene_indices, :].sum(axis=0)
+        f = gene / total
+
+        return f
+
+    def get_bc_geneNum(self):
+        """
+        Returns {bc: geneNum}, total_genes
+        """
+        gene_index, bc_index = self.__matrix.nonzero()
+        total_genes = len(set(gene_index))
+        bc_gene = defaultdict(set)
+        for gene, bc in zip(gene_index, bc_index):
+            bc_gene[bc].add(gene)
+        return {bc: len(bc_gene[bc]) for bc in bc_gene}, total_genes
+
+    def get_barcodes(self):
+        return self.__barcodes
+
+    def get_features(self):
+        return self.__features
+
+    def get_matrix(self):
+        return self.__matrix
diff --git a/sccore/utils.py b/sccore/utils.py
@@ -1,9 +1,9 @@
-from collections import defaultdict
 import gzip
 import json
 import logging
 import sys
 import time
+from collections import defaultdict
 from datetime import timedelta
 from functools import wraps
 

diff --git a/setup.py b/setup.py
@@ -9,6 +9,7 @@
 install_requires = [
     "pysam",
     "pandas>=2.0.0",
+    "scipy",
 ]
 
 entrys = []