-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
ba4a59a
commit 73fd3aa
Showing
6 changed files
with
343 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
{ | ||
"python.analysis.typeCheckingMode": "basic", | ||
"python.analysis.autoImportCompletions": true | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
VERSION = "1.0.1" | ||
VERSION = "1.0.2" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
""" | ||
estimate ambient from human_mouse sample | ||
""" | ||
|
||
from sccore.matrix import CountMatrix | ||
import pandas as pd | ||
import argparse | ||
import glob | ||
import os | ||
import functools | ||
|
||
|
||
def get_metrics_dict(mtx_path, doublet_threshold): | ||
mtx = CountMatrix.from_matrix_dir(mtx_path) | ||
m = mtx.get_matrix() | ||
f = mtx.get_features() | ||
|
||
human = [] | ||
mouse = [] | ||
for i, gn in enumerate(f.gene_name): | ||
if gn == gn.upper(): | ||
human.append(i) | ||
else: | ||
mouse.append(i) | ||
umi_h = m.tocsr()[human, :].sum(axis=0) | ||
umi_m = m.tocsr()[mouse, :].sum(axis=0) | ||
df_m = pd.DataFrame(umi_m.T, columns=["mouse"]) | ||
df_h = pd.DataFrame(umi_h.T, columns=["human"]) | ||
df = pd.merge(df_m, df_h, left_index=True, right_index=True) | ||
df["umi_sum"] = df.sum(axis=1) | ||
df["human_percent"] = df["human"] / df["umi_sum"] * 100 | ||
df["mouse_percent"] = df["mouse"] / df["umi_sum"] * 100 | ||
df["identity"] = "doublet" | ||
df.loc[df["human_percent"] < doublet_threshold, "identity"] = "mouse" | ||
df.loc[df["mouse_percent"] < doublet_threshold, "identity"] = "human" | ||
df.loc[df["identity"] == "human", "ambient_percent"] = df[df["identity"] == "human"].mouse_percent | ||
df.loc[df["identity"] == "mouse", "ambient_percent"] = df[df["identity"] == "mouse"].human_percent | ||
|
||
n_cell = df.shape[0] | ||
n_doublet = df[df["identity"] == "doublet"].shape[0] | ||
doublet_percent = round(n_doublet / n_cell * 100, 2) | ||
dict = { | ||
"n_cell": n_cell, | ||
"n_human_cell": df[df["identity"] == "human"].shape[0], | ||
"n_mouse_cell": df[df["identity"] == "mouse"].shape[0], | ||
"doublet_umi_percent_threshold": doublet_threshold, | ||
"n_doublet": n_doublet, | ||
"doublet_cell_percent": doublet_percent, | ||
"median_ambient_umi_percent": round(df["ambient_percent"].median(), 2), | ||
"mean_ambient_umi_percent": round(df["ambient_percent"].mean(), 2), | ||
} | ||
|
||
return dict | ||
|
||
|
||
def main(): | ||
parser = argparse.ArgumentParser() | ||
parser.add_argument("-c", "--celescope_dir", type=str, required=True) | ||
parser.add_argument("--doublet_threshold", type=float, default=25.0) | ||
parser.add_argument("--out_prefix", type=str, default="human_mouse") | ||
args = parser.parse_args() | ||
|
||
cur_dir = os.getcwd() | ||
dfs = [] | ||
for d in args.celescope_dir.split(","): | ||
os.chdir(d) | ||
paths = glob.glob("*/filtered") | ||
for path in paths: | ||
sample = path.split("/")[0] | ||
dict = get_metrics_dict(path, args.doublet_threshold) | ||
dfs.append(pd.DataFrame.from_dict(dict, orient="index", columns=[sample])) | ||
|
||
df = functools.reduce(lambda left, right: pd.merge(left, right, left_index=True, right_index=True), dfs) | ||
df = df.T | ||
df = df.astype( | ||
{ | ||
"n_cell": int, | ||
"n_human_cell": int, | ||
"n_mouse_cell": int, | ||
"n_doublet": int, | ||
} | ||
) | ||
os.chdir(cur_dir) | ||
df.to_csv(f"{args.out_prefix}.tsv", sep="\t") | ||
|
||
|
||
if __name__ == "__main__": | ||
main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,248 @@ | ||
import gzip | ||
import os | ||
from collections import defaultdict | ||
|
||
import scipy.io | ||
import scipy.sparse | ||
import pandas as pd | ||
from . import utils | ||
|
||
|
||
BARCODE_FILE_NAME = "barcodes.tsv.gz" | ||
FEATURE_FILE_NAME = "features.tsv.gz" | ||
MATRIX_FILE_NAME = "matrix.mtx.gz" | ||
|
||
|
||
ROW = "geneID" | ||
COLUMN = "Barcode" | ||
|
||
|
||
class Features: | ||
def __init__(self, gene_id: list, gene_name=None, gene_type=None): | ||
""" | ||
Args: | ||
gene_id: list of gene id | ||
gene_name: list of gene name | ||
type: ype of features, e.g. [gene, protein] | ||
""" | ||
self.gene_id = list(gene_id) | ||
if not gene_name: | ||
self.gene_name = self.gene_id | ||
else: | ||
self.gene_name = list(gene_name) | ||
self.gene_type = gene_type | ||
|
||
@classmethod | ||
def from_tsv(cls, tsv_file): | ||
if (not tsv_file) or (not os.path.exists(tsv_file)): | ||
raise FileNotFoundError( | ||
f"ERROR: {tsv_file} does not exist. \nSOLUTION: Rename and gzip the features file to {FEATURE_FILE_NAME}\n" | ||
) | ||
df = pd.read_csv( | ||
tsv_file, | ||
sep="\t", | ||
on_bad_lines="skip", | ||
names=["gene_id", "gene_name", "type"], | ||
dtype=str, | ||
) | ||
gene_id = df["gene_id"].tolist() | ||
gene_name = df["gene_name"].tolist() | ||
# avoid adding extra \t to genes.tsv when all the gene_type are Nan | ||
# if gene_type is None and add to dataframe, will cause Seurat::Read error: Error in FUN(X[[i]], ...) : # # subscript out of bounds | ||
if df["type"].isnull().sum() == len(df["type"]): | ||
gene_type = None | ||
else: | ||
gene_type = df["type"].tolist() | ||
return cls(gene_id, gene_name, gene_type) | ||
|
||
def to_tsv(self, tsv_file): | ||
""" | ||
if gene_type is None and add to dataframe, will cause Seurat::Read10X error: Error in FUN(X[[i]], ...) : subscript out of bounds | ||
""" | ||
if self.gene_type: | ||
df = pd.DataFrame( | ||
{ | ||
"gene_id": self.gene_id, | ||
"gene_name": self.gene_name, | ||
"gene_type": self.gene_type, | ||
} | ||
) | ||
else: | ||
df = pd.DataFrame({"gene_id": self.gene_id, "gene_name": self.gene_name}) | ||
df.to_csv(tsv_file, sep="\t", index=False, header=False) | ||
|
||
|
||
def get_matrix_file_path(matrix_dir, file_name): | ||
""" | ||
compatible with non-gzip file | ||
""" | ||
non_gzip = file_name.strip(".gz") | ||
file_path_list = [f"{matrix_dir}/{file_name}", f"{matrix_dir}/{non_gzip}"] | ||
for file_path in file_path_list: | ||
if os.path.exists(file_path): | ||
return file_path | ||
|
||
|
||
class CountMatrix: | ||
def __init__(self, features: Features, barcodes: list, matrix): | ||
""" | ||
Args: | ||
features: Features object | ||
barcodes: list of barcodes | ||
matrix: scipy.sparse.coo_matrix | ||
""" | ||
self.__features = features | ||
self.__barcodes = barcodes | ||
self.__matrix = matrix | ||
self.shape = matrix.shape | ||
|
||
@staticmethod | ||
@utils.add_log | ||
def read_barcodes(matrix_dir): | ||
""" | ||
returns: barcodes set | ||
""" | ||
barcode_file = get_matrix_file_path(matrix_dir, BARCODE_FILE_NAME) | ||
barcodes, _ = utils.read_one_col(barcode_file) | ||
return set(barcodes) | ||
|
||
@classmethod | ||
@utils.add_log | ||
def from_matrix_dir(cls, matrix_dir): | ||
if not os.path.exists(matrix_dir): | ||
raise FileNotFoundError(f"{matrix_dir} does not exist") | ||
features_tsv = get_matrix_file_path(matrix_dir, FEATURE_FILE_NAME) | ||
features = Features.from_tsv(tsv_file=features_tsv) | ||
barcode_file = get_matrix_file_path(matrix_dir, BARCODE_FILE_NAME) | ||
barcodes = utils.read_one_col(barcode_file) | ||
matrix_path = get_matrix_file_path(matrix_dir, MATRIX_FILE_NAME) | ||
matrix = scipy.io.mmread(matrix_path) | ||
|
||
return cls(features, barcodes, matrix) | ||
|
||
@utils.add_log | ||
def to_matrix_dir(self, matrix_dir): | ||
os.mkdir(matrix_dir) | ||
self.__features.to_tsv(f"{matrix_dir}/{FEATURE_FILE_NAME}") | ||
pd.Series(self.__barcodes).to_csv(f"{matrix_dir}/{BARCODE_FILE_NAME}", index=False, sep="\t", header=False) | ||
matrix_path = f"{matrix_dir}/{MATRIX_FILE_NAME}" | ||
with gzip.open(matrix_path, "wb") as f: | ||
scipy.io.mmwrite(f, self.__matrix) | ||
|
||
@classmethod | ||
def from_dataframe(cls, df, features: Features, barcodes=None, value="UMI"): | ||
""" | ||
Use all gene_id from features even if it is not in df | ||
Args: | ||
df: count details dataframe with columns UMI and multi-index (column, row). | ||
features: Features | ||
barcodes: only keep barcode in barcodes | ||
value: value name in df, UMI | ||
""" | ||
if not barcodes: | ||
barcodes = df.index.levels[0].tolist() | ||
|
||
feature_index_dict = {} | ||
for index, gene_id in enumerate(features.gene_id): | ||
feature_index_dict[gene_id] = index | ||
|
||
barcode_index_dict = {} | ||
for index, barcode in enumerate(barcodes): | ||
barcode_index_dict[barcode] = index | ||
|
||
# use all barcodes | ||
barcode_codes = [barcode_index_dict[barcode] for barcode in df.index.get_level_values(level=0)] | ||
# use all gene_id from features even if it is not in df | ||
gene_id_codes = [feature_index_dict[gene_id] for gene_id in df.index.get_level_values(level=1)] | ||
mtx = scipy.sparse.coo_matrix( | ||
(df[value], (gene_id_codes, barcode_codes)), | ||
shape=(len(features.gene_id), len(barcodes)), | ||
) | ||
|
||
return cls(features, barcodes, mtx) | ||
|
||
def __str__(self): | ||
n_row, n_col = self.shape[0], self.shape[1] | ||
return f"CountMatrix object\n {n_row} x {n_col} coo_matrix" | ||
|
||
def __repr__(self): | ||
return self.__str__() | ||
|
||
def concat_by_barcodes(self, other): | ||
if other.get_barcodes() != self.get_barcodes(): | ||
raise ValueError("barcodes are not the same") | ||
|
||
if inter := set(self.get_features().gene_id).intersection(set(other.get_features().gene_id)): | ||
raise ValueError(f"deuplicated gene_id: {inter}") | ||
|
||
gene_id = self.get_features().gene_id + other.get_features().gene_id | ||
gene_name = self.get_features().gene_name + other.get_features().gene_name | ||
gene_type = None | ||
if self.get_features().gene_type and other.get_features().gene_type: | ||
gene_type = self.get_features().gene_type + other.get_features().gene_type | ||
features = Features(gene_id, gene_name, gene_type) | ||
|
||
matrix = scipy.sparse.vstack([self.get_matrix(), other.get_matrix()]) | ||
|
||
return CountMatrix(features, self.__barcodes, matrix) | ||
|
||
@utils.add_log | ||
def slice_matrix(self, slice_barcodes_indices): | ||
""" | ||
Args: | ||
slice_barcodes_indices: list of barcode indices | ||
Returns: | ||
CountMatrix object | ||
""" | ||
mtx_csc = self.__matrix.tocsc() | ||
slice_barcodes_indices.sort() | ||
sliced_mtx = mtx_csc[:, slice_barcodes_indices] | ||
barcodes = [self.__barcodes[i] for i in slice_barcodes_indices] | ||
return CountMatrix(self.__features, barcodes, sliced_mtx) | ||
|
||
@utils.add_log | ||
def slice_matrix_bc(self, bcs): | ||
""" | ||
Args: | ||
bcs: cell barcodes | ||
Returns: | ||
CountMatrix object | ||
""" | ||
barcodes_indices = [self.__barcodes.index(barcode) for barcode in bcs] | ||
barcodes_indices.sort() | ||
return self.slice_matrix(barcodes_indices) | ||
|
||
@utils.add_log | ||
def get_genes_fraction(self, gene_list): | ||
""" | ||
Returns: | ||
numpy 2d fraction of gene_names in gene_list | ||
""" | ||
gene_indices = [self.__features.gene_name.index(gene) for gene in gene_list] | ||
mtx = self.__matrix.tocsc() | ||
total = mtx.sum(axis=0) | ||
gene = mtx[gene_indices, :].sum(axis=0) | ||
f = gene / total | ||
|
||
return f | ||
|
||
def get_bc_geneNum(self): | ||
""" | ||
Returns {bc: geneNum}, total_genes | ||
""" | ||
gene_index, bc_index = self.__matrix.nonzero() | ||
total_genes = len(set(gene_index)) | ||
bc_gene = defaultdict(set) | ||
for gene, bc in zip(gene_index, bc_index): | ||
bc_gene[bc].add(gene) | ||
return {bc: len(bc_gene[bc]) for bc in bc_gene}, total_genes | ||
|
||
def get_barcodes(self): | ||
return self.__barcodes | ||
|
||
def get_features(self): | ||
return self.__features | ||
|
||
def get_matrix(self): | ||
return self.__matrix |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,6 +9,7 @@ | |
install_requires = [ | ||
"pysam", | ||
"pandas>=2.0.0", | ||
"scipy", | ||
] | ||
|
||
entrys = [] | ||
|