Skip to content

Commit

Permalink
ready for resubmission
Browse files Browse the repository at this point in the history
  • Loading branch information
moritzbuck committed Nov 5, 2021
1 parent fe6d228 commit 9add954
Show file tree
Hide file tree
Showing 9 changed files with 128 additions and 45 deletions.
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# mOTUlizer



**DISCLAIMER, there is an other tool out there called mOTUs that creates OTU-tables directly from reads, if you are looking for that tool, this is the wrong page, you want to go ['here'](https://motu-tool.org/), but while you on my page, why don't you check out mOTUlizer, it's cool, I swear**

Utility to analyse a group of closely related MAGs/Genomes/bins/SUBs of more or less dubious origin. Right now it is composed of a number of programs:

* `mOTUlize.py` takes a set of genomes (I will use the term genome as a short hand for set of nucleotide sequences that presumably come from the same organism/population, can be incomplete, redundant or contaminated) and cluster them in to metagenomic Operational Taxonomic Units (mOTUs). Using similarity scores (by default ANI as computed by fastANI, but user can provide other similarities) a network is built based on (user defined) better quality genomes (for historical reasons called MAGs) by thresholding the similarities at a specific value (95% by default). The connected components of this graph are the mOTUs. Additionally lower quality genomes (SUBs, ) are recruited to the mOTU of whichever MAG they are most similar too if the similarity is above the threshold.
Expand Down
21 changes: 15 additions & 6 deletions mOTUlizer/bin/mOTUconvert.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,25 +40,34 @@

def motuconvert(args):

kwargs = dict()
if args.gene2genome:
print("Parsing the --gene2genome file", file = sys.stderr )
with open(args.gene2genome) as handle:
kwargs['gene_id2genome'] = {l.split("\t")[0] : l.split("\t")[1].strip().split(";") for l in handle}

method = args.in_type
if method == "emapper":
converter = EmapperParse()
converter = EmapperParse(**kwargs)
elif method == "ppanggolin":
converter = PPanGGolinParse()
converter = PPanGGolinParse(**kwargs)
elif method == "roary":
converter = RoaryParse()
converter = RoaryParse(**kwargs)
elif method == "mmseqs2":
converter = MmseqsParse()
converter = MmseqsParse(**kwargs)
elif method == "anvio":
converter = AnvioParse()
converter = AnvioParse(**kwargs)
else :
print("This is not implemented yet run with '--list' to see available options", file = sys.stderr )
sys.exit(0)


print("Doing the convertion", file = sys.stderr )

genome2gene_clusterss = converter.convert(infile = args.input, count = args.count)

if args.output:
out_handle = open(out_json, "w")
out_handle = open(args.output, "w")
else :
out_handle = sys.stdout

Expand Down
15 changes: 14 additions & 1 deletion mOTUlizer/bin/mOTUpan.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,10 +112,23 @@ def motupan(args):
out_handle = sys.stdout
stats = motu.get_stats()


if args.checkm:
abs = lambda x : x if x > 0 else -x
mean = lambda l : sum(l)/len(l)
mean_complet_diff = mean([k.checkm_complet - k.new_completness for k in motu])
max_complete_diff = max([k.checkm_complet - k.new_completness for k in motu])
if mean_complet_diff > 5:
print(f"**WARNING** : the mean difference between you prior and posterior completeness estimates is pretty high ({mean_complet_diff:.2f} %), This doesn't have to be a problem, but it could be due to a highly unbalanced set of genomes (e.g. many of one strain and few of an other), some genomes in the input set that shouldn't be there, or maybe your gene-clusters are too 'strict'...", file = sys.stderr)
if max_complete_diff > 60:
print(f"**WARNING** : the largest difference between you prior and posterior completeness estimates is pretty high ({max_complete_diff:.2f} %), This doesn't have to be a problem, but it could be due to a highly unbalanced set of genomes (e.g. many of one strain and few of an other), some genomes in the input set that shouldn't be there, or maybe your gene-clusters are too 'strict'...", file = sys.stderr)

nb_boots = args.boots

stats[name].update(motu.roc_values(boots=nb_boots))
print(motu.roc_values(boots=nb_boots))
if args.long and not args.genome2gene_clusters_only:

stats[name].update(motu.roc_values(boots=nb_boots))
json.dump(stats, out_handle, indent=4, sort_keys=True)
elif args.genome2gene_clusters_only :
json.dump({k : list(v) for k,v in motu.gene_clusters_dict.items()}, out_handle, indent=4, sort_keys=True)
Expand Down
4 changes: 2 additions & 2 deletions mOTUlizer/classes/COGs.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,10 +98,10 @@ def compute_COGs(faas, name, precluster = False, threads = 4, method = "mmseqsC
genome2gene_clusters[vv].update([v])

elif method == "mmseqsCluster" :
"coverage = 80% with cov-mode = 0, minimal amino acid sequence identity = 80% and cluster-mode = 0"
"coverage = 80% with cov-mode = 0, minimal amino acid sequence identity = 0% and cluster-mode = 0"
covmode = 0
cov = 0.80
seqid = 0.80
seqid = 0.0
if not shutil.which("mmseqs"):
print("You need mmseqs2 to run the silix gene-clustering, either install it or run mOTUpan with an other gene-clustering or your own traits", file = sys.stderr)
sys.exit(-1)
Expand Down
5 changes: 3 additions & 2 deletions mOTUlizer/classes/MockData.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,9 @@ def __init__(self, name, core_len, nb_genomes, completeness, max_it = 20, access
self.recall = len(core.intersection(self.core))/core_len
if len([not c.startswith("CoreTrait_") for c in self.core]) != 0:
self.fpr = sum([not c.startswith("CoreTrait_") for c in self.core])/len([not c.startswith("CoreTrait_") for c in self.core])
else self.fpr = 1

else :
self.fpr = 1

self.lowest_false = {k : v for k,v in self.gene_clustersCounts.items() if k in self.core and k not in core}
self.lowest_false = 1 if(len(self.lowest_false) ==0) else min(self.lowest_false.items(), key = lambda x : x[1])[1]/len(self)

Expand Down
16 changes: 9 additions & 7 deletions mOTUlizer/classes/Parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@
class Parser(metaclass=abc.ABCMeta):

def __init__(self, **kwargs):
if 'gene_id2genome' in kwargs and gene_id2genome:
if type(gene_id2genome) == dict:
self.gene_id2genome = gene_id2genome
if 'gene_id2genome' in kwargs and kwargs['gene_id2genome']:
if type(kwargs['gene_id2genome']) == dict:
self.gene_id2genome = kwargs['gene_id2genome']
else :
print("TODO")
sys.exit(0)
Expand Down Expand Up @@ -193,11 +193,13 @@ def convert(self, infile, count = False):

print("Stratify it to genome", file = sys.stderr)
if not self.gene_id2genome:
self.gene_id2genome = {k : "_".join(k.split("_")[:-1]) for k in gene_id2family.keys()}
self.gene_id2genome = {k : ["_".join(k.split("_")[:-1])] for k in gene_id2family.keys()}

genome2family = {k : [] for k in set(self.gene_id2genome.values())}
for k,v in gene_id2family.items():
genome2family[self.gene_id2genome[k]] += [v]
genome2family = { kk : [] for k in self.gene_id2genome.values() for kk in k}

for k,v in tqdm(gene_id2family.items()):
for vv in self.gene_id2genome[k]:
genome2family[vv] += [v]
if count:
return {k : {vv : v.count(vv) for vv in set(v)} for k,v in genome2family.items()}
else :
Expand Down
2 changes: 1 addition & 1 deletion mOTUlizer/classes/mOTU.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from mOTUlizer.classes.MetaBin import MetaBin
from mOTUlizer.classes.MetaBin import MetaBin
from mOTUlizer.classes.COGs import *
import subprocess
import tempfile
Expand Down
86 changes: 61 additions & 25 deletions mOTUlizer/scripts/prochloros.py
Original file line number Diff line number Diff line change
Expand Up @@ -380,6 +380,14 @@ def running_tools():
python ../../mOTUlizer/bin/mOTUconvert.py --in_type ppanggolin static_data/ppanggolin_clusters_goods.h5 > static_data/ppanggolin_goods.gid2gene_clusters
ppanggolin annotate --anno nucleotides/s__Prochlorococcus_A_clusters.txt -o static_data/ --basename ppanggolin_clusters_pAs --use_pseudo --tmpdir . -c 20 -f
ppanggolin cluster -p static_data/ppanggolin_clusters_goods.h5 -c 20 --tmpdir .
ppanggolin graph -p static_data/ppanggolin_clusters_goods.h5 -c 20
ppanggolin partition -K3 -f -p static_data/ppanggolin_clusters_goods.h5 --cpu 20
python ../../mOTUlizer/bin/mOTUconvert.py --in_type ppanggolin static_data/ppanggolin_clusters_goods.h5 > static_data/ppanggolin_goods.gid2gene_clusters
mkdir other_soft/roary/s__Prochlorococcus_A
roary -p 22 -o other_soft/roary/s__Prochlorococcus_A_clusters.txt -cd 1 -v `cat nucleotides/gff_list | cut -f2`
cp accessory* blast_identity_frequency.Rtab core_accessory* gene_presence_absence.* number_of_* summary_statistics.txt other_soft/roary/s__Prochlorococcus_A
Expand Down Expand Up @@ -420,13 +428,14 @@ def run_motupan(genomes, gid2gene_clusters, name = "test" , k=15):
gene_clusters_dict = {k : gid2gene_clusters.get(k,gid2gene_clusters[k.split("#")[0]]) for k in kks}
checkm_loc = {g : checkm.get(g,checkm[g.split("#")[0]])['Completeness'] for g in kks}
motu = None
motu = mOTU( name = name , faas = faas_loc, gene_clusters_dict = gene_clusters_dict, genome_completion_dict = checkm_loc, max_it = 20, threads = 20, precluster = True, method = "default")
motu = mOTU( name = name , faas = faas_loc, gene_clusters_dict = gene_clusters_dict, genome_completion_dict = checkm_loc, max_it = 20, threads = 20, precluster = True, method = "default", quiet = True)
stats = motu.get_stats()
# roc = motu.roc_values()
roc = motu.roc_values(1)
eff_genomes = sum([v for v in checkm_loc.values()])/100
new_eff = sum([g.new_completness for g in motu])/100
out = { 'nb_genomes' : k, 'core_len' : len(stats['test']['core']), 'aux_len' : len(stats['test']['aux_genome']), "checkm_eff" : eff_genomes, "motu_eff" : new_eff, 'mean_new_complete' : mean([g.new_completness for g in motu]) }
# out.update(roc)
out.update(roc)
out['core'] = motu.core
return out

def get_data(g):
Expand Down Expand Up @@ -558,29 +567,56 @@ def roary_vs_motupan():
with open("analyses/roary_rarefaction.csv", "w") as handle:
handle.writelines([",".join(head) + "\n"] + [ ",".join([str(dd[k]) for k in head]) + '\n' for dd in boots])

with open("static_data/ppanggolin_goods.gid2gene_clusters") as handle:
with open("static_data/ppanggolin_species.gid2cog") as handle:
ppanggolin_gi2gene_clusters = {k : set(v) for k, v in json.load(handle).items()}
boots = []
for i in tqdm(range(3, len(ppanggolin_gi2gene_clusters))):
for j in range(reps):
sub_gid2gene_clusters = {k : ppanggolin_gi2gene_clusters[k] for k in choices(list(ppanggolin_gi2gene_clusters), k=i)}
est_complete = mean([checkm[k]['Completeness'] for k in sub_gid2gene_clusters])
tt = pange_dict2roary_classes(sub_gid2gene_clusters)
tt['nb_org'] = i
tt['rep'] = j
tt['method'] = "strict"
tt['est_checkm_complete'] = est_complete
motupan = run_motupan(list(sub_gid2gene_clusters.keys()),k=i, gid2gene_clusters = sub_gid2gene_clusters)

tt['motupan_est_checkm'] = motupan['mean_new_complete']
tt['motupan_core'] = motupan['core_len']
tt['motupan_cloud'] = motupan['aux_len']

boots += [tt]

head = list(boots[0].keys())
with open("analyses/ppanggolin_rarefaction.csv", "w") as handle:
handle.writelines([",".join(head) + "\n"] + [ ",".join([str(dd[k]) for k in head]) + '\n' for dd in boots])
boots = []
fulls = {k : ppanggolin_gi2gene_clusters[k] for k in ppanggolin_gi2gene_clusters if k in full_p_As}
bests = run_motupan(full_p_As, k = len(full_p_As), gid2gene_clusters = fulls)
best_cores = []
for i in tqdm(range(10)):
sub_gid2gene_clusters = {k : ppanggolin_gi2gene_clusters[k] for k in choices(list(ppanggolin_gi2gene_clusters), k=200)}
motupan = run_motupan(list(sub_gid2gene_clusters.keys()),k=300, gid2gene_clusters = sub_gid2gene_clusters)
best_cores += [motupan['core']]

soft_core = set.union(*best_cores)
best_core = bests['core']

def prepar_motu(nb_mags, rep):
i = nb_mags
j = rep
sub_gid2gene_clusters = {k : ppanggolin_gi2gene_clusters[k] for k in choices(list(ppanggolin_gi2gene_clusters), k=i)}
est_complete = mean([checkm[k]['Completeness'] for k in sub_gid2gene_clusters])
tt = pange_dict2roary_classes(sub_gid2gene_clusters)
tt['nb_org'] = i
tt['rep'] = j
tt['method'] = "strict"
tt['est_checkm_complete'] = est_complete
motupan = run_motupan(list(sub_gid2gene_clusters.keys()),k=i, gid2gene_clusters = sub_gid2gene_clusters)
tt['motupan_est_checkm'] = motupan['mean_new_complete']
tt['motupan_core'] = motupan['core_len']
tt['motupan_cloud'] = motupan['aux_len']
tt['bootstrapped_fpr'] = motupan['mean_fpr']
tt['lowest_false'] = motupan['mean_lowest_false']
tt['recall'] = motupan['mean_recall']
tt['empirical_fpr'] = len(motupan['core'].difference(soft_core))/len(soft_core)
return tt

import multiprocessing as mp
pool = mp.Pool(mp.cpu_count())

results2 = []
for j in range(20):
results2 += pool.starmap_async(prepar_motu, [ (i, j) for i in range(3,len(ppanggolin_gi2gene_clusters))]).get()
pandas.DataFrame.from_records(results2).to_csv("analyses/motupan_rarefact_w_ppanggolin_cogs_3.tsv")
print(f"========== Done rep {j} ========")

# for j in tqdm(range(reps)):
# for i in tqdm(range(3, len(ppanggolin_gi2gene_clusters))):
# boots += [tt]

# head = list(boots[0].keys())
# with open("analyses/ppanggolin_rarefaction.csv", "w") as handle:
# handle.writelines([",".join(head) + "\n"] + [ ",".join([str(dd[k]) for k in head]) + '\n' for dd in boots])



Expand Down
20 changes: 19 additions & 1 deletion mOTUlizer/scripts/prochlos_figs.R
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
library(data.table)
library(pheatmap)
library(RColorBrewer)
library('philentropy')
#library('philentropy')
library(ggplot2)
library(ggrepel)
library(ggpubr)
library(scales)

dd = fread("analyses/motupan_species_rarefact_w_roary_cogs.tsv")
dd[, tool := "mOTUpan"]
Expand Down Expand Up @@ -211,3 +213,19 @@ ggplot(meted[sample(nrow(meted))], aes(x=mean_completeness_ppan, y=value/mean_es
xlab('mean completeness')+ylab('Fraction of core COGs')#+ylim(0,1100)

ggsave("~/temp/motupan_vs_ppanggolin_core_fract.pdf", width = 7, height = 6)

rar_new = fread("analyses/motupan_rarefact_w_ppanggolin_cogs_3.tsv")
tt = rar_new$empirical_fpr > 0 & rar_new$bootstrapped_fpr > 0
rar_new = rar_new[tt]
mm = melt(rar_new[,c("nb_org", "empirical_fpr", "bootstrapped_fpr")], id.vars = "nb_org")

g1 = ggplot(rar_new, aes(x=empirical_fpr, y=bootstrapped_fpr, col=nb_org))+geom_point()+
geom_density2d()+geom_smooth(method="lm", se = FALSE, col="purple", size=1.5)+
geom_abline(col="red", size=1.5)+theme_minimal()+scale_y_log10()+scale_x_log10()+scale_color_gradientn(colors = c("#91bfdb", "#ffffbf", "#fc8d59"), trans = "log10", name = "Genome count:")+
xlab("Empirical fpr")+ylab("Bootstrapped fpr")
g2 = ggplot(mm, aes(x=nb_org, y=value, col=variable))+geom_point(alpha=0.3)+geom_smooth(se = FALSE, size=1.5)+scale_y_log10()+theme_minimal()+
scale_color_manual(values = c(muted("#1b9e77"), muted("#d95f02")), labels = c("Empirical", "Bootstrapped"), name = "fpr type:")+
xlab("Genome count")+ylab("False positive rate")

ggarrange(g1,g2,labels = c("[A]", "[B]"), ncol=2)
ggsave("analyses/sup_fig.pdf", width = 12, height = 8)

0 comments on commit 9add954

Please sign in to comment.