Skip to content

Commit

Permalink
added fast calculation of silhouetterank
Browse files Browse the repository at this point in the history
  • Loading branch information
bernard2012 committed Sep 8, 2020
1 parent 146bdf4 commit 9fe4e34
Show file tree
Hide file tree
Showing 6 changed files with 551 additions and 6 deletions.
7 changes: 6 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def run(self):

setuptools.setup(
name="silhouetteRank",
version="1.0.5.11",
version="1.0.5.13",
author="Qian Zhu",
author_email="[email protected]",
description="silhouetteRank is a tool for finding spatially variable genes based on computing silhouette coefficient from binarized spatial gene expression data",
Expand All @@ -55,6 +55,11 @@ def run(self):
"silhouette_rank_one = silhouetteRank.silhouette_rank_one:main",
"silhouette_rank_main = silhouetteRank.evaluate_2b:main",
"silhouette_rank_random = silhouetteRank.evaluate_exact_one_2b:main",
"silhouette_rank_random_batch = silhouetteRank.evaluate_exact_2b:main",
"slrank_fast = silhouetteRank.evaluate_fast_2b:main",
"slrank_random_fast = silhouetteRank.evaluate_fast_one_2b:main",
"slrank_prep_fast = silhouetteRank.prep_fast:main",
"slrank_combine_fast = silhouetteRank.combine_fast:main",
]
},
classifiers=(
Expand Down
187 changes: 187 additions & 0 deletions silhouetteRank/combine_fast.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,187 @@
import math
import sys
import os
import re
import scipy
import scipy.stats
import numpy as np
from operator import itemgetter
import silhouetteRank
import logging
import argparse
import subprocess
def read(n):
f = open(n)
by_gene = {}
for l in f:
l = l.rstrip("\n")
ll = l.split()
gene = ll[0]
pval = float(ll[-2])
by_gene[gene] = pval
f.close()
return by_gene

def do_one(args):
result = subprocess.call("Rscript --version 2> /dev/null", shell=True)
if result==127:
sys.stderr.write("Rscript is not found\n")
sys.stderr.flush()
sys.exit(1)

outdir=args.input
log_file = "%s/%s" % (outdir, args.log_file)
logger = logging.getLogger("combine_fast")
logger.setLevel(logging.DEBUG)

if not logger.hasHandlers():
handler = logging.FileHandler(log_file)
handler.setFormatter(logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s"))
logger.addHandler(handler)
if args.verbose:
logger.addHandler(logging.StreamHandler())

logger.info("Entering %s..." % os.path.basename(args.input))

logger.info("Using existing input binaries...")
expr = np.load("%s/expr.npy" % args.input)
Xcen = np.load("%s/Xcen.npy" % args.input)
genes = np.load("%s/genes.npy" % args.input)

ncell = Xcen.shape[0]
param = {}
full_list = {}
for examine_top in args.examine_tops:
for rbp_p in args.rbp_ps:
target = int(ncell * examine_top)
dname = "%s/result_fast_sim_5000_%.2f_%.3f" % (args.input, rbp_p, examine_top)
if args.matrix_type=="dissim":
dname = "%s/result_fast_5000_%.2f_%.3f" % (args.input, rbp_p, examine_top)
f = open("%s/par.%d" % (dname, target))
n_scale = float(f.readline().rstrip("\n").split("\t")[1])
n_shape = float(f.readline().rstrip("\n").split("\t")[1])
f.close()
scores = []
f = open("%s/%d" % (dname, target))
for l in f:
l = l.rstrip("\n")
scores.append(float(l))
f.close()
scores = np.sort(np.array(scores))
param[(rbp_p, examine_top, target)] = (n_scale, n_shape, scores[-250])
full_list[(rbp_p, examine_top, target)] = scores

for examine_top in args.examine_tops:
for rbp_p in args.rbp_ps:
for trial in range(args.num_trials):
fname = "%s/silhouette.sim.fast.rbp.%.2f.top.%.3f.%d.txt" % (args.input, rbp_p, examine_top, trial)
if args.matrix_type=="dissim":
fname = "%s/silhouette.fast.rbp.%.2f.top.%.3f.%d.txt" % (args.input, rbp_p, examine_top, trial)
target = int(ncell * examine_top)
f = open(fname)
entries = []
by_size = {}
ids = {}
ind = 0
for l in f:
l = l.rstrip("\n")
ll = l.split("\t")
g = ll[1]
sc = float(ll[2])
n_scale, n_shape, n_exceed = param[(rbp_p, examine_top, target)]
if sc>n_exceed:
xx = math.pow(1.0 - n_shape * (sc - n_exceed) / n_scale, 1.0 / n_shape)
P = 0.05 * xx
else:
P = 0.01 * (100 - scipy.stats.percentileofscore(full_list[(rbp_p, examine_top, target)], sc))
entries.append([g, target, target, sc, P])
by_size.setdefault(target, [])
by_size[target].append(P)
ids.setdefault(target, [])
ids[target].append(ind)
ind+=1
f.close()

for targ in ids:
fw = open("/tmp/1", "w")
for i in by_size[targ]:
fw.write(str(i) + "\n")
fw.close()
os.system("Rscript %s/qval.R /tmp/1 /tmp/1.qval" % os.path.dirname(silhouetteRank.__file__))
s_scores = []
f = open("/tmp/1.qval")
for l in f:
l = l.rstrip("\n")
s_scores.append(float(l))
f.close()
for i1, i2 in zip(ids[targ], s_scores):
entries[i1].append(i2)
o_name = "%s/silhouette.sim.fast.rbp.%.2f.top.%.3f.%d.pval.txt" % (args.input, rbp_p, examine_top, trial)
if args.matrix_type=="dissim":
o_name = "%s/silhouette.fast.rbp.%.2f.top.%.3f.%d.pval.txt" % (args.input, rbp_p, examine_top, trial)
fw = open(o_name, "w")
for i1, i2, i3, i4, i5, i6 in entries:
fw.write("%s %s %s %s %s %s\n" % (str(i1), str(i2), str(i3), str(i4), str(i5), str(i6)))
fw.close()


by_gene = {}
for examine_top in args.examine_tops:
for rbp_p in args.rbp_ps:
for trial in range(args.num_trials):
fname = "%s/silhouette.sim.fast.rbp.%.2f.top.%.3f.%d.pval.txt" % (args.input, rbp_p, examine_top, trial)
if args.matrix_type=="dissim":
fname = "%s/silhouette.fast.rbp.%.2f.top.%.3f.%d.pval.txt" % (args.input, rbp_p, examine_top, trial)
by_gene[(examine_top, rbp_p, trial)] = read(fname)

all_genes = list(by_gene[(args.examine_tops[0], args.rbp_ps[0], 0)].keys())
score = {}
pval = {}
for g in all_genes:
score[g] = 0
tot_test = 0
for i in args.examine_tops:
for j in args.rbp_ps:
for k in range(args.num_trials):
score[g] += math.log(by_gene[(i, j, k)][g])
tot_test+=1
score[g] *= -2.0
pval[g] = np.exp(scipy.stats.chi2.logsf(score[g], tot_test*2))

score_it = list(score.items())
score_it.sort(key=itemgetter(1), reverse=True)
fw = open("/tmp/1.pval", "w")
for i,j in score_it:
fw.write(str(pval[i]) + "\n")
fw.close()

os.system("Rscript %s/qval.R /tmp/1.pval /tmp/1.qval" % os.path.dirname(silhouetteRank.__file__))
f = open("/tmp/1.qval")
q_score = []
for l in f:
l = l.rstrip("\n")
q_score.append(float(l))
f.close()

fw = open(args.output, "w")
for (i,j),k in zip(score_it, q_score):
fw.write("%s %s %s %s\n" % (str(i), str(j), str(pval[i]), str(k)))
fw.close()

if __name__=="__main__":
parser = argparse.ArgumentParser(description="combine.py: combine spatial scores across parameters", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("-i", "--input", dest="input", type=str, required=True, help="input directory (should be whatever that contains result_5000)")
#parser.add_argument("-j", "--input-random", dest="input_random", type=str, required=True, help="input random silhouette scores (for random patterns)")
parser.add_argument("-o", "--output", dest="output", type=str, required=True, help="output file name")
#parser.add_argument("-u", "--output-dir", dest="outdir", type=str, help="output directory containing binary files", default=".")
parser.add_argument("-l", "--log-filename", dest="log_file", type=str, default="master.combine.fast.log", help="log file name (no path), will be generated in same directory as --output")
parser.add_argument("-v", "--verbose", dest="verbose", action="store_true", help="print verbose messages to console")

parser.add_argument("-r", "--rbp-ps", dest="rbp_ps", nargs="+", type=float, default=[0.95, 0.99], help="p parameter of RBP")
parser.add_argument("-e", "--examine-tops", dest="examine_tops", nargs="+", type=float, default=[0.005, 0.010, 0.050, 0.100, 0.300], help="top proportion of cells per gene to be 1's (expressed)")
parser.add_argument("-m", "--matrix-type", dest="matrix_type", type=str, choices=["sim", "dissim"], help="whether to calculate similarity matrix or dissimilarity matrix", default="dissim")
parser.add_argument("-t", "--num-trials", dest="num_trials", type=int, default=1, help="number of trials")
#parser.add_argument("-i", "--input-dir", dest="input", type=str, default=".", help="input directory containing individual spatial score rankings (to be aggregated)")
#parser.add_argument("-o", "--output", dest="output", type=str, required=True, help="output file name")
args = parser.parse_args()
do_one(args)
130 changes: 130 additions & 0 deletions silhouetteRank/evaluate_fast_2b.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
import sys
import os
import re
import numpy as np
import math
import scipy
import silhouetteRank
import silhouetteRank.spatial_genes as spatial_genes
from operator import itemgetter
from scipy.spatial.distance import squareform, pdist
from scipy.stats import percentileofscore
from sklearn.metrics import roc_auc_score
import argparse
import logging

def read(f_expr="expression.txt", f_Xcen="Xcen.good", logger=logging):
logger.info("Reading gene expression...")
f = open(f_expr)
h = f.readline().rstrip("\n").split("\t")[1:]
num_cell = len(h)
num_gene = 0
for l in f:
l = l.rstrip("\n")
num_gene+=1
f.close()

expr = np.empty((num_gene, num_cell), dtype="float32")
genes = []
f = open(f_expr)
f.readline()
ig = 0
for l in f:
l = l.rstrip("\n")
ll = l.split("\t")
gene = ll[0]
genes.append(gene)
expr[ig,:] = [float(v) for v in ll[1:]]
ig+=1
f.close()
logger.info("Reading cell coordinates...")
f = open(f_Xcen)
Xcen = np.empty((num_cell, 2), dtype="float32")
f.readline()
ic = 0
for l in f:
l = l.rstrip("\n")
ll = l.split("\t")
Xcen[ic,:] = [float(ll[1]), float(ll[2])]
ic+=1
f.close()
return expr, Xcen, genes

def do_one(args):
matrix_type = args.matrix_type
rbp_p = args.rbp_p

if not os.path.isdir(args.output):
os.mkdir(args.output)

logdir="%s/logs.fast" % args.output
if not os.path.isdir(logdir):
os.mkdir(logdir)
log_file = "%s/real_%.2f_%.3f.out" % (logdir, args.rbp_p, args.examine_top)
logger = logging.getLogger("real_fast_%.2f_%.3f" % (args.rbp_p, args.examine_top))
logger.setLevel(logging.DEBUG)

if not logger.hasHandlers():
handler = logging.FileHandler(log_file, "w")
handler.setFormatter(logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s"))
logger.addHandler(handler)
if args.verbose:
logger.addHandler(logging.StreamHandler())

t_overwrite = args.overwrite_input_bin
check_required = ["expr.npy", "Xcen.npy", "genes.npy", "t_matrix_%s_%.2f.npy" % (args.matrix_type, args.rbp_p)]
for cr in check_required:
if not os.path.isfile("%s/%s" % (args.output, cr)):
t_overwrite = True
break

expr, Xcen, genes, t_matrix = None, None, None, None
if t_overwrite:
expr, Xcen, genes = read(f_expr=args.expr, f_Xcen=args.centroid, logger=logger)
logger.info("Calculate all pairwise Euclidean distance between cells using their physical coordinates")
euc = squareform(pdist(Xcen, metric="euclidean"))
logger.info("Rank transform euclidean distance, and then apply exponential transform")
t_matrix = spatial_genes.rank_transform_matrix(euc, reverse=False, rbp_p=rbp_p, matrix_type=matrix_type, logger=logger)
np.save("%s/t_matrix_%s_%.2f.npy" % (args.output, args.matrix_type, args.rbp_p), t_matrix)
np.save("%s/expr.npy" % args.output, expr)
np.save("%s/Xcen.npy" % args.output, Xcen)
np.save("%s/genes.npy" % args.output, genes)
else:
logger.info("Using existing input binaries...")
expr = np.load("%s/expr.npy" % args.output)
Xcen = np.load("%s/Xcen.npy" % args.output)
genes = np.load("%s/genes.npy" % args.output)
t_matrix = np.load("%s/t_matrix_%s_%.2f.npy" % (args.output, args.matrix_type, args.rbp_p))

logger.info("Compute silhouette metric per gene using fast method")
examine_top = args.examine_top

for t_trial in range(args.num_trials):
res = spatial_genes.calc_silhouette_per_gene_approx(genes=genes, expr=expr, matrix=t_matrix, matrix_type=matrix_type, examine_top=examine_top, logger=logger)
if matrix_type=="sim":
f_name = "%s/silhouette.sim.fast.rbp.%.2f.top.%.3f.%d.txt" % (args.output, rbp_p, examine_top, t_trial)
else:
f_name = "%s/silhouette.fast.rbp.%.2f.top.%.3f.%d.txt" % (args.output, rbp_p, examine_top, t_trial)
fw = open(f_name, "w")
for ind,v in enumerate(res):
fw.write("%d\t%s\t%.10f\n" % (ind, v[0], v[1]))
fw.close()

def main():
parser = argparse.ArgumentParser(description="evaluate.2b.py: calculate silhouette score for spatial patterns", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("-x", "--file-expr", dest="expr", type=str, required=True, help="expression matrix. Will use input binary expr.npy (if exists) to speed up reading.")
parser.add_argument("-c", "--file-centroid", dest="centroid", type=str, required=True, help="cell coordinate. Will use input binary Xcen.npy (if exists) to speed up reading.")
parser.add_argument("-w", "--overwrite-input-binary", dest="overwrite_input_bin", action="store_true", help="overwrite input binaries")
parser.add_argument("-e", "--examine-top", dest="examine_top", type=float, default=0.05, help="top proportion of cells per gene to be 1's (expressed)")

parser.add_argument("-r", "--rbp-p", dest="rbp_p", type=float, default=0.95, help="p parameter of RBP")
parser.add_argument("-m", "--matrix-type", dest="matrix_type", type=str, choices=["sim", "dissim"], help="whether to calculate similarity matrix or dissimilarity matrix", default="dissim")
parser.add_argument("-o", "--output-dir", dest="output", type=str, default=".", help="output directory")
parser.add_argument("-t", "--num-trials", dest="num_trials", type=int, default=1, help="number of trials")
parser.add_argument("-v", "--verbose", dest="verbose", action="store_true", help="print verbose messages to console")

args = parser.parse_args()
do_one(args)

if __name__=="__main__":
main()
Loading

0 comments on commit 9fe4e34

Please sign in to comment.