
Commit

Merge branch 'dev'
moritzbuck committed Sep 2, 2020
2 parents 5144683 + 7d2f7e9 commit 1544306
Showing 13 changed files with 25,539 additions and 62 deletions.
2 changes: 1 addition & 1 deletion mOTUlizer/bin/mOTUlize.py
@@ -137,7 +137,7 @@ def main(args):
parser.add_argument('--MAG-completeness', '--MC', '-M', nargs = '?', type=float, default = 40, help = "completeness cutoff for seed MAGs, default : 40")
parser.add_argument('--MAG-contamination', '--Mc', '-m', nargs = '?', type=float, default = 5, help = "contamination cutoff for seed MAGs, default : 5")
parser.add_argument('--SUB-completeness', '--SC', '-S', nargs = '?', type=float, default = 0, help = "completeness cutoff for recruited SUBs, default : 0")
parser.add_argument('--SUB-contamination', '--Sc', '-s', nargs = '?', type=float, default = 100, help = "contamination cutoff for recruited SUBs, default : 0")
parser.add_argument('--SUB-contamination', '--Sc', '-s', nargs = '?', type=float, default = 100, help = "contamination cutoff for recruited SUBs, default : 100")
parser.add_argument('--similarity-cutoff', '-i', nargs = '?', type=float, default = 95, help = "distance cutoff for making the graph, default : 95")
parser.add_argument('--cpus', '-c', nargs = '?', type=int, default = 1, help = "number of threads, default : 1")
parser.add_argument('--keep-simi-file', '-K', nargs = '?', default = None, help = "keep generated similarity file if '--similarities' is not procided")
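
Note: the only change in this file is the help string for --SUB-contamination, which now matches the actual argparse default of 100. A minimal standalone sketch (not part of the repository) showing the relationship between the declared default and the parsed value:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--SUB-contamination', '--Sc', '-s', nargs='?', type=float, default=100,
                    help="contamination cutoff for recruited SUBs, default : 100")

args = parser.parse_args([])              # no flags given, so the default applies
assert args.SUB_contamination == 100      # argparse exposes the dashed option as SUB_contamination
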
30 changes: 22 additions & 8 deletions mOTUlizer/bin/mOTUpan.py
@@ -12,6 +12,7 @@
#print("This is temporary, fix the hard-path once all is clean", file=sys.stderr)
sys.path.append("/home/moritz/projects/0039_mOTUlizer/")

import mOTUlizer
from mOTUlizer.classes import *
from mOTUlizer.utils import *
from mOTUlizer.classes.mOTU import mOTU
@@ -23,9 +24,13 @@
Returns all to stdout by default.
"""

__checkm_default__ = 95

def main(args):
if args.version:
print("{script} Version {version}".format(script = __file__, version = mOTUlizer.__version__))
sys.exit()

if args.cog_file:
try :
if args.cog_file.endswith(".json") or args.cog_file.endswith(".gid2cog"):
@@ -55,8 +60,12 @@ def main(args):
assert all([os.path.exists(f) for f in faas.values()]), "one or some of your faas don't exists"


genomes = set(faas.keys()).intersection(set(cog_dict.keys()))
print(len(genomes))
if len(faas) > 0:
genomes = set(faas.keys()).intersection(set(cog_dict.keys()))
else :
genomes = set(cog_dict.keys())
print("len cog dict" , len(cog_dict))

if cog_dict and len(faas) > 0:
if len(genomes) != len(faas) or len(faas) != len(cog_dict):
print("your faas and cog_drct are not the same length,\nit might not matter just wanted to let you know.", file = sys.stderr)
@@ -71,8 +80,10 @@

print("Parsing the checkm-file")
checkm = {k : v['Completeness'] for k,v in parse_checkm(args.checkm).items()}
checkm = {g : checkm[g] for g in genomes}

checkm = {g : checkm.get(g) for g in genomes}
if not all([v for v in checkm.values()]):
print("you did not have completeness for all bins/genomes, abscent values have been put to default value, e.g. " + str(__checkm_default__))
checkm = {k : v if v else __checkm_default__ for k,v in checkm.items()}
if args.seed :
for f in genomes:
checkm[f] = args.seed
@@ -88,14 +99,16 @@ def main(args):
if faas is None and cogs is None:
sys.exit("at least one of --faas and --cog_file is required")

print(len(genomes), " genomes in your clade")

motu = mOTU( name = name , faas = faas , cog_dict = cog_dict, checkm_dict = checkm, max_it = max_it)

if args.output:
out_handle = open(out_json, "w")
else :
out_file = sys.stdout
if not args.genome2cog_only:
json.dump(motu.get_stats(), out_handle)
json.dump(motu.get_stats(), out_handle, indent=4, sort_keys=True)
else :
json.dump({k : list(v) for k,v in motu.cog_dict.items()}, out_handle)
if args.output:
@@ -104,11 +117,11 @@ def main(args):
return None

if __name__ == "__main__":
parser = argparse.ArgumentParser()#prog = "mOTUlizer", description=description_text, epilog = "Let's do this")
parser = argparse.ArgumentParser(prog = "mOTUpan.py", description=description_text, epilog = "Let's do this")
parser.add_argument('--output', '-o', nargs = '?', help = "send output to this file")
parser.add_argument('--force', '-f', action='store_true', help = "force execution answering default answers")
parser.add_argument('--checkm', '-k',nargs = '?', help = "checkm file if you want to see completnesses with it")
parser.add_argument('--seed', '-s', type = float , nargs = '?', help = "seed completeness, advice a number around 90 (95 default)")
parser.add_argument('--seed', '-s', type = float , nargs = '?', help = "seed completeness, advice a number around 90 ({} default)".format(__checkm_default__))
parser.add_argument('--length_seed', '--ls', action='store_true', help = "seed completeness by length fraction [0-100]")
parser.add_argument('--random_seed', '--rs', action='store_true', help = "random seed completeness between 5 and 95 percent")
parser.add_argument('--genome2cog_only', action='store_true', help = "returns genome2cog only")
@@ -117,6 +130,7 @@ def main(args):
parser.add_argument('--cog_file', '--cogs', '-c', nargs = '?', help = "file with COG-sets (see doc)")
parser.add_argument('--name', '-n', nargs = '?', help = "if you want to name this bag of bins")
parser.add_argument('--max_iter', '-m', nargs = '?', type = int , default = 20 , help = "if you want to name this bag of bins")
parser.add_argument('--version','-v', action="store_true", help = "get version")

if len(sys.argv)==1:
parser.print_help(sys.stderr)
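
Note: besides the new --version flag and the prettier, sorted JSON output, the main behavioural change above is that genomes missing from the checkm file now fall back to __checkm_default__ (95) instead of raising a KeyError. A small sketch of that fallback with made-up names and values (the script's own check also replaces falsy values such as a completeness of 0):

__checkm_default__ = 95

def fill_completeness(checkm, genomes, default=__checkm_default__):
    """Give every genome a completeness value, using `default` where none was provided."""
    filled = {g: checkm.get(g) for g in genomes}
    missing = [g for g, v in filled.items() if v is None]
    if missing:
        print("no completeness for", missing, "- using the default of", default)
    return {g: (default if v is None else v) for g, v in filled.items()}

print(fill_completeness({"bin_A": 87.2}, {"bin_A", "bin_B"}))
# e.g. {'bin_A': 87.2, 'bin_B': 95}
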
2 changes: 1 addition & 1 deletion mOTUlizer/classes/MetaBin.py
@@ -9,7 +9,7 @@ class MetaBin:
def __repr__(self) :
return "< bin {name} with {n} cogs>".format(n = len(self.cogs) if cogs else "NA", name = self.name)

def __init__(self, name, cogs,fnas, faas, complet = None, contamin = 0, max_complete = 95):
def __init__(self, name, cogs,fnas, faas, complet = None, contamin = 0, max_complete = 99.9):
self.name = name
self.cogs = cogs
self.faas = faas
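
Note: the only change here raises the completeness cap from 95 to 99.9. Presumably this keeps the log-likelihood terms in mOTU.py finite, since a completeness of exactly 100% would put a zero inside a log10. An illustrative clamp, not the repository's code:

from math import log10

def clamp_completeness(value, min_complete=0.01, max_complete=99.9):
    return min(max(value, min_complete), max_complete)

p = clamp_completeness(100) / 100      # 0.999 rather than 1.0
print(log10(1 - p))                    # about -3, instead of a math domain error at log10(0)
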
99 changes: 99 additions & 0 deletions mOTUlizer/classes/MockData.py
@@ -0,0 +1,99 @@
from random import random, sample
from mOTUlizer.classes.mOTU import mOTU, mean
from collections import defaultdict
from tqdm import tqdm
from numpy.random import normal
from numpy import floor, mean

genome2guass = {}

with open("mOTUlizer/data/Efeacalis.csv") as handle:
emp_disp = [int(l[:-1].split(",")[-1])/691 for l in handle if "accessory" in l]
gene_count = sum(emp_disp)

class MockmOTU(mOTU):
def __repr__(self) :
return "< MockmOTU with {n} genomes, of average {c}% completness, with core/genome_len of {r} >".format(c = 100*self.mean_completeness, n = len(self), r = self.ratio)

def __init__(self, name, core_len, nb_genomes, completeness, max_it = 20):

core = {"CoreTrait_{}".format(i) for i in range(core_len)}


sub_dist = [int(nb_genomes/i) for i in range(2,1000) if int(nb_genomes/i) > 0] + [1]*100
sub_dist = list(range(nb_genomes-1, 1,-1))

self.size_accessory = sum(sub_dist)
self.mean_size_accessory = sum(sub_dist)/nb_genomes

mock_genomes = dict()
for k in range(nb_genomes):
mock_genomes["Genome_{}".format(k)] = list(core)

for i,v in enumerate(sub_dist):
genomes = sample(list(mock_genomes.keys()), v)
for g in genomes:
mock_genomes[g] += ["AccessoryTrait_{}".format(i)]

self.incompletes = {g : {vv for vv in v if random() < (completeness(g)/100)} for g, v in mock_genomes.items()}

for k, v in self.incompletes.items():
if len(v) == 0:
choice(core)

self.mean_completeness = mean([len({vv for vv in v if vv.startswith("CoreTrait_")})/core_len for c,v in self.incompletes.items()])
self.completenesses = {c : 100*len({vv for vv in v if vv.startswith("CoreTrait_")})/core_len for c,v in self.incompletes.items()}
# self.accessory = accessory
self.mean_size = mean([len(m) for m in mock_genomes.values()])
self.real_core_len = core_len

zerifneg = lambda g: 0.001 if g < 0 else g
super().__init__(name = name, faas = {}, cog_dict = self.incompletes, checkm_dict = { k : zerifneg(normal(v, 10)) for k,v in self.completenesses.items()}, max_it = max_it)
self.recall = len(core.intersection(self.core))/core_len
self.lowest_false = {k : v for k,v in self.cogCounts.items() if k in self.core and k not in core}
self.lowest_false = 1 if(len(self.lowest_false) ==0) else min(self.lowest_false.items(), key = lambda x : x[1])[1]/len(self)

def mock_cog_stats(self):
all_genes = set.union(*self.incompletes.values())
outp = {t : {} for t in all_genes}
for t,dd in outp.items():
dd['freq'] = sum([t in zz for zz in self.incompletes.values()])/len(self.incompletes)
dd['core'] = t in self.core
dd['type'] = "core" if t.startswith("CoreTrait_") else "accessory"
dd['nb_genomes'] = len(self)
dd['core_len'] = len(self.core)
dd['real_core_len'] = self.read_core_len
dd['llikelihood'] = self.likelies[t]
dd['len_accessory_genome'] = len(all_genes) - dd['real_core_len']
return outp

@classmethod
def guauss_completes(cls, g, mean_completeness = 60, stdev = 10):
if g in genome2guass:
return genome2guass[g]
else :
out_prob = 1000
while(not (20 < out_prob < 99) ):
out_prob = normal(mean_completeness, stdev)
genome2guass[g] = out_prob
return out_prob

@classmethod
def run_boots(cls):
out = {}

for i in range(1000, 2500, 500):
for nb in range(10, 250, 15):
for c in range(30, 100, 5):
MockData.genome2guass = {}
mockmotu = MockmOTU("complete_{}_core_size_{}_nbgenomes_{}".format(c,i,nb).format(c), i, nb, lambda g : MockmOTU.guauss_completes(g, mean_completeness = c, stdev = 10), max_it = 100)
out[mockmotu.name] = {}
out[mockmotu.name]['core_size'] = i
out[mockmotu.name]['nb_genomes'] = nb
out[mockmotu.name]['mean_completeness'] = mockmotu.mean_completeness
out[mockmotu.name]['mean_genome_size'] = mockmotu.mean_size
out[mockmotu.name]['recall'] = mockmotu.recall
out[mockmotu.name]['lowest_false'] = mockmotu.lowest_false
out[mockmotu.name]['accessory_genepool'] = mockmotu.size_accessory
out[mockmotu.name]['mean_new_completness'] = mean([b.new_completness for b in mockmotu])
return out
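
Note: MockData.py is a new benchmarking module. It simulates an mOTU with a known core, degrades each genome to a chosen completeness, adds Gaussian noise to the completeness estimates, and records how well the inferred core recovers the true one. A hypothetical usage sketch follows (argument values are made up, and it assumes the bundled Efeacalis.csv data file is readable relative to the working directory):

from mOTUlizer.classes.MockData import MockmOTU

mock = MockmOTU(
    name="demo",
    core_len=1000,        # number of true core traits
    nb_genomes=50,        # number of simulated genomes
    completeness=lambda g: MockmOTU.guauss_completes(g, mean_completeness=70, stdev=10),
    max_it=100,
)
print(mock.recall)        # fraction of the true core recovered by the estimate
print(mock.lowest_false)  # frequency of the rarest trait wrongly called core
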
65 changes: 17 additions & 48 deletions mOTUlizer/classes/mOTU.py
@@ -29,6 +29,7 @@ def __init__(self, **kwargs):
def __for_mOTUpan(self, name, faas, cog_dict, checkm_dict, max_it = 20):
self.name = name
self.faas = faas
print("Creating mOTU for mOTUpan")
if not cog_dict :
tt = compute_COGs(self.faas, name = name + "COG")
self.cog_dict = tt['genome2cogs']
@@ -44,13 +45,14 @@ def __for_mOTUpan(self, name, faas, cog_dict, checkm_dict, max_it = 20):
checkm_dict[f] = 100*len(self.cog_dict[f])/max_len

self.members = [MetaBin(bin_name, cogs = self.cog_dict[bin_name], faas = self.faas.get(bin_name), fnas = None, complet = checkm_dict.get(bin_name)) for bin_name in self.cog_dict.keys()]
self.core = None

self.cogCounts = {c : 0 for c in set.union(*[mag.cogs for mag in self.members])}
for mag in self.members:
for cog in mag.cogs:
self.cogCounts[cog] += 1

self.core = {cog for cog, counts in self.cogCounts.items() if (100*counts/len(self)) > mean(checkm_dict.values())}

self.likelies = self.__core_likelyhood(max_it = max_it)

def __getitem__(self, i):
@@ -114,7 +116,7 @@ def get_stats(self):
"core" : list(self.core) if self.core else None,
"aux_genome" : [k for k,v in self.cogCounts.items() if k not in self.core] if self.core else None ,
"singleton_cogs" : [k for k,v in self.cogCounts.items() if k not in self.core if v == 1] if self.core else None,
"cogs" : {'genome' : {k : list(v) for k,v in self.cog_dict.items()}, 'aa' : self.aa2cog} if self.aa2cog else None,
"cogs" : {'genome' : {k : list(v) for k,v in self.cog_dict.items()}, 'aa' : self.aa2cog} if self.aa2cog else ({k : list(v) for k,v in self.cog_dict.items()} if self.cog_dict else None),
"mean_ANI" : self.get_mean_ani() if (hasattr(self, 'fastani_dict') or all([hasattr(g, "genome") for g in self])) else None,
"ANIs" : [[k[0], k[1], v] for k, v in self.fastani_matrix().items()] if (hasattr(self, 'fastani_dict') or all([hasattr(g, "genome") for g in self])) else None,
"genomes" : [v.get_data() for v in self],
@@ -130,9 +132,9 @@ def get_mean_ani(self):

return {'mean_ANI' : sum([d for d in dists if d])/len(found_edges) if len(found_edges) > 0 else None, 'missing_edges' : missing_edges, 'total_edges' : len(found_edges) + missing_edges}

def __core_likelyhood(self, max_it = 20 ):
def __core_likelyhood(self, max_it = 20, likeli_cutof = 0 ):
likelies = {cog : self.__core_likely(cog) for cog in self.cogCounts}
self.core = set([c for c, v in likelies.items() if v > 0])
self.core = set([c for c, v in likelies.items() if v > likeli_cutof])
core_len = len(self.core)
i = 1
print("iteration 1 : ", core_len, "LHR:" , sum(likelies.values()), file = sys.stderr)
@@ -142,10 +144,11 @@ def __core_likelyhood(self, max_it = 20 ):
else :
mag.new_completness = 0
mag.new_completness = mag.new_completness if mag.new_completness < 99.9 else 99.9
mag.new_completness = mag.new_completness if mag.new_completness > 0 else 0.1
mag.new_completness = mag.new_completness if mag.new_completness > 0 else 0.01
for i in range(2,max_it):
likelies = {cog : self.__core_likely(cog, complet = "new", core_size = core_len) for cog in self.cogCounts}
self.core = set([c for c, v in likelies.items() if v > 0])
old_core = self.core
self.core = set([c for c, v in likelies.items() if v > likeli_cutof])
new_core_len = len(self.core)
for mag in self:
if len(self.core) > 0:
@@ -155,7 +158,7 @@ def __core_likelyhood(self, max_it = 20 ):
mag.new_completness = mag.new_completness if mag.new_completness < 99.9 else 99.9
mag.new_completness = mag.new_completness if mag.new_completness > 0 else 0.01
print("iteration",i, ": ", new_core_len, "LHR:" , sum(likelies.values()), file = sys.stderr)
if new_core_len == core_len:
if self.core == old_core:
break
else :
core_len =new_core_len
@@ -172,15 +175,19 @@ def __core_prob(self, cog, complet = "checkm"):
return sum(presence + abscence)

def __pange_prob(self, cog, core_size, complet = "checkm"):
pool_size = sum(self.cogCounts.values())
# pool_size = sum(self.cogCounts.values())
pool_size = sum([c for k,c in self.cogCounts.items()])
comp = lambda mag : (mag.checkm_complet if complet =="checkm" else mag.new_completness)/100
#presence = [1 - (1-self.cogCounts[cog]/pool_size)**(len(mag.cogs)-(core_size*comp(mag))) for mag in self if cog in mag.cogs]
#abscence = [ (1-self.cogCounts[cog]/pool_size)**(len(mag.cogs)-(core_size*comp(mag))) for mag in self if cog not in mag.cogs]

# presence = [ log10(1 - ( 1 - 1/len(self.cogCounts))**(len(mag.cogs)-(core_size*comp(mag)))) for mag in self if cog in mag.cogs]
# abscence = [ log10(( 1 - 1/len(self.cogCounts))**(len(mag.cogs)-(core_size*comp(mag)))) for mag in self if cog not in mag.cogs]
# mag_prob = {mag : ( 1-1/pool_size )**len(mag.cogs) for mag in self}
# mag_prob = {mag : ( 1-1/pool_size )**(len(mag.cogs)-(core_size*comp(mag))) for mag in self}
# mag_prob = {mag : ( 1-self.cogCounts[cog]/pool_size )**(len(mag.cogs)-(core_size*comp(mag))) for mag in self}
mag_prob = {mag : ( 1-self.cogCounts[cog]/pool_size )**(len(mag.cogs)-(core_size*comp(mag))) for mag in self}

# mag_prob = {mag : ( 1-self.cogCounts[cog]/pool_size)**len(mag.cogs) for mag in self}

presence = [ log10(1 - mag_prob[mag]) if mag_prob[mag] < 1 else MIN_PROB for mag in self if cog in mag.cogs]
abscence = [ log10( mag_prob[mag]) if mag_prob[mag] > 0 else log10(1-(10**MIN_PROB)) for mag in self if cog not in mag.cogs]

@@ -200,44 +207,6 @@ def get_pangenome_size(self, singletons = False):
return len([k for k,v in self.cogCounts.items() if k not in self.core and v > (0 if singletons else 1)])


def rarefy_pangenome(self, reps = 100, singletons = False, custom_cogs = None):
def __min_95(ll):
ll.sort_values()
return list(ll.sort_values())[round(len(ll)*0.05)]

def __max_95(ll):
ll.sort_values()
return list(ll.sort_values())[round(len(ll)*0.95)]

def __genome_count(ll):
return ll[0]

__genome_count.__name__ = "genome_count"
__max_95.__name__ = "max_95"
__min_95.__name__ = "min_95"

pange = set.union(*custom_cogs) if custom_cogs else {k for k,v in self.cogCounts.items() if k not in self.core and v > (0 if singletons else 1)}
series = []
for i in range(reps):
series += [{ 'rep' : i , 'genome_count' : 0, 'pangenome_size' : 0}]
m = custom_cogs if custom_cogs else [m.cogs for m in self.members.copy()]
shuffle(m)
founds = set()
for j,mm in enumerate(m):
founds = founds.union(mm.intersection(pange))
series += [{ 'rep' : i , 'genome_count' : j+1, 'pangenome_size' : len(founds)}]

t_pandas = pandas.DataFrame.from_records(series)[['genome_count', 'pangenome_size']]
t_pandas = t_pandas.groupby('genome_count').agg({'genome_count' : [__genome_count] ,'pangenome_size' : [mean, std, __min_95, __max_95]} )
t_pandas.columns = ["rr_" + p[1] for p in t_pandas.columns]
t_pandas = t_pandas.to_dict(orient="index")
tt = self.get_otu_stats()
for v in t_pandas.values():
v.update(tt)
return t_pandas



def __from_bins(self, bins, name,dist_dict = None ):
self.name = name

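
Note: the substantive changes in this file are (i) the iteration now stops when the core set itself stabilises rather than when its size does, (ii) the likelihood cutoff is exposed as likeli_cutof, and (iii) the rarefy_pangenome helper is removed (the @@ -200,44 +207,6 @@ hunk). The per-genome probability that now drives __pange_prob (the single uncommented mag_prob line above) can be sketched on its own as follows; variable names and example values here are mine, not the repository's, and the degenerate-probability branches are simplified:

from math import log10

MIN_PROB = -1000   # the new floor from mOTUlizer/config.py

def absence_prob(cog_count, pool_size, genome_size, core_size, completeness):
    """Probability that a genome misses a trait seen `cog_count` times in a pool of
    `pool_size` gene observations, given its gene-family count and completeness in [0, 1]."""
    draws = genome_size - core_size * completeness
    return (1 - cog_count / pool_size) ** draws

p = absence_prob(cog_count=12, pool_size=50_000, genome_size=3_000,
                 core_size=1_500, completeness=0.8)
log_presence = log10(1 - p) if p < 1 else MIN_PROB   # contribution if the trait is observed
log_absence  = log10(p) if p > 0 else MIN_PROB       # contribution if it is not
print(round(p, 3), round(log_presence, 3), round(log_absence, 3))
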
2 changes: 1 addition & 1 deletion mOTUlizer/config.py
@@ -1,3 +1,3 @@
THREADS = 20
MIN_PROB = -10 # log10 of prob of impossible event
MIN_PROB = -1000 # log10 of prob of impossible event
DB_FOLDER = "/home/moritz/dbs/"
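
Note: MIN_PROB is the log10 probability substituted when an observation would otherwise have probability 0 (or 1 for its complement). Lowering it from -10 to -1000 means a single "impossible" observation now dominates the summed log-likelihood instead of being comparable to ordinary terms. A toy illustration:

ordinary_terms = [-0.25] * 50          # fifty unremarkable log10 probabilities
print(sum(ordinary_terms) + (-10))     # -22.5   with the old floor: comparable to the other terms
print(sum(ordinary_terms) + (-1000))   # -1012.5 with the new floor: the impossible event dominates
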