Skip to content

Commit

Permalink
add human-mouse
Browse files Browse the repository at this point in the history
  • Loading branch information
zhouyiqi91 committed Sep 10, 2024
1 parent ba4a59a commit 73fd3aa
Show file tree
Hide file tree
Showing 6 changed files with 343 additions and 2 deletions.
4 changes: 4 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"python.analysis.typeCheckingMode": "basic",
"python.analysis.autoImportCompletions": true
}
2 changes: 1 addition & 1 deletion sccore/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
VERSION = "1.0.1"
VERSION = "1.0.2"
88 changes: 88 additions & 0 deletions sccore/cli/human_mouse.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
"""
estimate ambient from human_mouse sample
"""

from sccore.matrix import CountMatrix
import pandas as pd
import argparse
import glob
import os
import functools


def get_metrics_dict(mtx_path, doublet_threshold):
mtx = CountMatrix.from_matrix_dir(mtx_path)
m = mtx.get_matrix()
f = mtx.get_features()

human = []
mouse = []
for i, gn in enumerate(f.gene_name):
if gn == gn.upper():
human.append(i)
else:
mouse.append(i)
umi_h = m.tocsr()[human, :].sum(axis=0)
umi_m = m.tocsr()[mouse, :].sum(axis=0)
df_m = pd.DataFrame(umi_m.T, columns=["mouse"])
df_h = pd.DataFrame(umi_h.T, columns=["human"])
df = pd.merge(df_m, df_h, left_index=True, right_index=True)
df["umi_sum"] = df.sum(axis=1)
df["human_percent"] = df["human"] / df["umi_sum"] * 100
df["mouse_percent"] = df["mouse"] / df["umi_sum"] * 100
df["identity"] = "doublet"
df.loc[df["human_percent"] < doublet_threshold, "identity"] = "mouse"
df.loc[df["mouse_percent"] < doublet_threshold, "identity"] = "human"
df.loc[df["identity"] == "human", "ambient_percent"] = df[df["identity"] == "human"].mouse_percent
df.loc[df["identity"] == "mouse", "ambient_percent"] = df[df["identity"] == "mouse"].human_percent

n_cell = df.shape[0]
n_doublet = df[df["identity"] == "doublet"].shape[0]
doublet_percent = round(n_doublet / n_cell * 100, 2)
dict = {
"n_cell": n_cell,
"n_human_cell": df[df["identity"] == "human"].shape[0],
"n_mouse_cell": df[df["identity"] == "mouse"].shape[0],
"doublet_umi_percent_threshold": doublet_threshold,
"n_doublet": n_doublet,
"doublet_cell_percent": doublet_percent,
"median_ambient_umi_percent": round(df["ambient_percent"].median(), 2),
"mean_ambient_umi_percent": round(df["ambient_percent"].mean(), 2),
}

return dict


def main():
parser = argparse.ArgumentParser()
parser.add_argument("-c", "--celescope_dir", type=str, required=True)
parser.add_argument("--doublet_threshold", type=float, default=25.0)
parser.add_argument("--out_prefix", type=str, default="human_mouse")
args = parser.parse_args()

cur_dir = os.getcwd()
dfs = []
for d in args.celescope_dir.split(","):
os.chdir(d)
paths = glob.glob("*/filtered")
for path in paths:
sample = path.split("/")[0]
dict = get_metrics_dict(path, args.doublet_threshold)
dfs.append(pd.DataFrame.from_dict(dict, orient="index", columns=[sample]))

df = functools.reduce(lambda left, right: pd.merge(left, right, left_index=True, right_index=True), dfs)
df = df.T
df = df.astype(
{
"n_cell": int,
"n_human_cell": int,
"n_mouse_cell": int,
"n_doublet": int,
}
)
os.chdir(cur_dir)
df.to_csv(f"{args.out_prefix}.tsv", sep="\t")


if __name__ == "__main__":
main()
248 changes: 248 additions & 0 deletions sccore/matrix.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,248 @@
import gzip
import os
from collections import defaultdict

import scipy.io
import scipy.sparse
import pandas as pd
from . import utils


BARCODE_FILE_NAME = "barcodes.tsv.gz"
FEATURE_FILE_NAME = "features.tsv.gz"
MATRIX_FILE_NAME = "matrix.mtx.gz"


ROW = "geneID"
COLUMN = "Barcode"


class Features:
def __init__(self, gene_id: list, gene_name=None, gene_type=None):
"""
Args:
gene_id: list of gene id
gene_name: list of gene name
type: ype of features, e.g. [gene, protein]
"""
self.gene_id = list(gene_id)
if not gene_name:
self.gene_name = self.gene_id
else:
self.gene_name = list(gene_name)
self.gene_type = gene_type

@classmethod
def from_tsv(cls, tsv_file):
if (not tsv_file) or (not os.path.exists(tsv_file)):
raise FileNotFoundError(
f"ERROR: {tsv_file} does not exist. \nSOLUTION: Rename and gzip the features file to {FEATURE_FILE_NAME}\n"
)
df = pd.read_csv(
tsv_file,
sep="\t",
on_bad_lines="skip",
names=["gene_id", "gene_name", "type"],
dtype=str,
)
gene_id = df["gene_id"].tolist()
gene_name = df["gene_name"].tolist()
# avoid adding extra \t to genes.tsv when all the gene_type are Nan
# if gene_type is None and add to dataframe, will cause Seurat::Read error: Error in FUN(X[[i]], ...) : # # subscript out of bounds
if df["type"].isnull().sum() == len(df["type"]):
gene_type = None
else:
gene_type = df["type"].tolist()
return cls(gene_id, gene_name, gene_type)

def to_tsv(self, tsv_file):
"""
if gene_type is None and add to dataframe, will cause Seurat::Read10X error: Error in FUN(X[[i]], ...) : subscript out of bounds
"""
if self.gene_type:
df = pd.DataFrame(
{
"gene_id": self.gene_id,
"gene_name": self.gene_name,
"gene_type": self.gene_type,
}
)
else:
df = pd.DataFrame({"gene_id": self.gene_id, "gene_name": self.gene_name})
df.to_csv(tsv_file, sep="\t", index=False, header=False)


def get_matrix_file_path(matrix_dir, file_name):
"""
compatible with non-gzip file
"""
non_gzip = file_name.strip(".gz")
file_path_list = [f"{matrix_dir}/{file_name}", f"{matrix_dir}/{non_gzip}"]
for file_path in file_path_list:
if os.path.exists(file_path):
return file_path


class CountMatrix:
def __init__(self, features: Features, barcodes: list, matrix):
"""
Args:
features: Features object
barcodes: list of barcodes
matrix: scipy.sparse.coo_matrix
"""
self.__features = features
self.__barcodes = barcodes
self.__matrix = matrix
self.shape = matrix.shape

@staticmethod
@utils.add_log
def read_barcodes(matrix_dir):
"""
returns: barcodes set
"""
barcode_file = get_matrix_file_path(matrix_dir, BARCODE_FILE_NAME)
barcodes, _ = utils.read_one_col(barcode_file)
return set(barcodes)

@classmethod
@utils.add_log
def from_matrix_dir(cls, matrix_dir):
if not os.path.exists(matrix_dir):
raise FileNotFoundError(f"{matrix_dir} does not exist")
features_tsv = get_matrix_file_path(matrix_dir, FEATURE_FILE_NAME)
features = Features.from_tsv(tsv_file=features_tsv)
barcode_file = get_matrix_file_path(matrix_dir, BARCODE_FILE_NAME)
barcodes = utils.read_one_col(barcode_file)
matrix_path = get_matrix_file_path(matrix_dir, MATRIX_FILE_NAME)
matrix = scipy.io.mmread(matrix_path)

return cls(features, barcodes, matrix)

@utils.add_log
def to_matrix_dir(self, matrix_dir):
os.mkdir(matrix_dir)
self.__features.to_tsv(f"{matrix_dir}/{FEATURE_FILE_NAME}")
pd.Series(self.__barcodes).to_csv(f"{matrix_dir}/{BARCODE_FILE_NAME}", index=False, sep="\t", header=False)
matrix_path = f"{matrix_dir}/{MATRIX_FILE_NAME}"
with gzip.open(matrix_path, "wb") as f:
scipy.io.mmwrite(f, self.__matrix)

@classmethod
def from_dataframe(cls, df, features: Features, barcodes=None, value="UMI"):
"""
Use all gene_id from features even if it is not in df
Args:
df: count details dataframe with columns UMI and multi-index (column, row).
features: Features
barcodes: only keep barcode in barcodes
value: value name in df, UMI
"""
if not barcodes:
barcodes = df.index.levels[0].tolist()

feature_index_dict = {}
for index, gene_id in enumerate(features.gene_id):
feature_index_dict[gene_id] = index

barcode_index_dict = {}
for index, barcode in enumerate(barcodes):
barcode_index_dict[barcode] = index

# use all barcodes
barcode_codes = [barcode_index_dict[barcode] for barcode in df.index.get_level_values(level=0)]
# use all gene_id from features even if it is not in df
gene_id_codes = [feature_index_dict[gene_id] for gene_id in df.index.get_level_values(level=1)]
mtx = scipy.sparse.coo_matrix(
(df[value], (gene_id_codes, barcode_codes)),
shape=(len(features.gene_id), len(barcodes)),
)

return cls(features, barcodes, mtx)

def __str__(self):
n_row, n_col = self.shape[0], self.shape[1]
return f"CountMatrix object\n {n_row} x {n_col} coo_matrix"

def __repr__(self):
return self.__str__()

def concat_by_barcodes(self, other):
if other.get_barcodes() != self.get_barcodes():
raise ValueError("barcodes are not the same")

if inter := set(self.get_features().gene_id).intersection(set(other.get_features().gene_id)):
raise ValueError(f"deuplicated gene_id: {inter}")

gene_id = self.get_features().gene_id + other.get_features().gene_id
gene_name = self.get_features().gene_name + other.get_features().gene_name
gene_type = None
if self.get_features().gene_type and other.get_features().gene_type:
gene_type = self.get_features().gene_type + other.get_features().gene_type
features = Features(gene_id, gene_name, gene_type)

matrix = scipy.sparse.vstack([self.get_matrix(), other.get_matrix()])

return CountMatrix(features, self.__barcodes, matrix)

@utils.add_log
def slice_matrix(self, slice_barcodes_indices):
"""
Args:
slice_barcodes_indices: list of barcode indices
Returns:
CountMatrix object
"""
mtx_csc = self.__matrix.tocsc()
slice_barcodes_indices.sort()
sliced_mtx = mtx_csc[:, slice_barcodes_indices]
barcodes = [self.__barcodes[i] for i in slice_barcodes_indices]
return CountMatrix(self.__features, barcodes, sliced_mtx)

@utils.add_log
def slice_matrix_bc(self, bcs):
"""
Args:
bcs: cell barcodes
Returns:
CountMatrix object
"""
barcodes_indices = [self.__barcodes.index(barcode) for barcode in bcs]
barcodes_indices.sort()
return self.slice_matrix(barcodes_indices)

@utils.add_log
def get_genes_fraction(self, gene_list):
"""
Returns:
numpy 2d fraction of gene_names in gene_list
"""
gene_indices = [self.__features.gene_name.index(gene) for gene in gene_list]
mtx = self.__matrix.tocsc()
total = mtx.sum(axis=0)
gene = mtx[gene_indices, :].sum(axis=0)
f = gene / total

return f

def get_bc_geneNum(self):
"""
Returns {bc: geneNum}, total_genes
"""
gene_index, bc_index = self.__matrix.nonzero()
total_genes = len(set(gene_index))
bc_gene = defaultdict(set)
for gene, bc in zip(gene_index, bc_index):
bc_gene[bc].add(gene)
return {bc: len(bc_gene[bc]) for bc in bc_gene}, total_genes

def get_barcodes(self):
return self.__barcodes

def get_features(self):
return self.__features

def get_matrix(self):
return self.__matrix
2 changes: 1 addition & 1 deletion sccore/utils.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
from collections import defaultdict
import gzip
import json
import logging
import sys
import time
from collections import defaultdict
from datetime import timedelta
from functools import wraps

Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
install_requires = [
"pysam",
"pandas>=2.0.0",
"scipy",
]

entrys = []
Expand Down

0 comments on commit 73fd3aa

Please sign in to comment.