-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
13 changed files
with
25,539 additions
and
62 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
from random import random, sample | ||
from mOTUlizer.classes.mOTU import mOTU, mean | ||
from collections import defaultdict | ||
from tqdm import tqdm | ||
from numpy.random import normal | ||
from numpy import floor, mean | ||
|
||
# Per-genome cache of completeness draws; read and filled by
# MockmOTU.guauss_completes.
genome2guass = {}

# Empirical accessory-gene values: the last CSV column of every line that
# mentions "accessory", divided by 691.
# NOTE(review): 691 is presumably the number of genomes behind the CSV -- confirm.
with open("mOTUlizer/data/Efeacalis.csv") as handle:
    # int() ignores surrounding whitespace, so the line terminator does not
    # need to be sliced off; slicing would eat a digit on a final line that
    # has no trailing newline.
    emp_disp = [int(line.split(",")[-1]) / 691 for line in handle if "accessory" in line]
gene_count = sum(emp_disp)
|
||
class MockmOTU(mOTU):
    """Synthetic mOTU made of randomly degraded mock genomes.

    Every mock genome starts with the same set of core traits plus a random
    selection of accessory traits, and is then subsampled trait by trait to
    simulate incompleteness. The degraded genomes are handed to the parent
    ``mOTU`` constructor, after which the recovered core is compared with the
    known one (``recall`` and ``lowest_false``).
    """

    def __repr__(self):
        return "< MockmOTU with {n} genomes, of average {c}% completness, with core/genome_len of {r} >".format(
            c=100 * self.mean_completeness, n=len(self), r=self.ratio)

    def __init__(self, name, core_len, nb_genomes, completeness, max_it=20):
        """Generate the mock genomes and run the parent mOTU on them.

        Args:
            name: name forwarded to the parent constructor.
            core_len: number of core traits shared by every mock genome.
            nb_genomes: number of mock genomes to generate.
            completeness: callable taking a genome name and returning the
                percentage (0-100) used as the keep-probability of each of
                that genome's traits.
            max_it: forwarded to the parent constructor.

        Raises:
            ValueError: if ``core_len`` or ``nb_genomes`` is not positive
                (both are used as divisors below).
        """
        if core_len <= 0 or nb_genomes <= 0:
            raise ValueError("core_len and nb_genomes must both be positive")

        core = {"CoreTrait_{}".format(i) for i in range(core_len)}

        # Accessory trait number i is given to sub_dist[i] genomes:
        # nb_genomes - 1 genomes for the first trait, down to 2 for the last.
        sub_dist = list(range(nb_genomes - 1, 1, -1))

        self.size_accessory = sum(sub_dist)
        self.mean_size_accessory = sum(sub_dist) / nb_genomes

        mock_genomes = {"Genome_{}".format(k): list(core) for k in range(nb_genomes)}

        genome_names = list(mock_genomes)
        for i, nb_carriers in enumerate(sub_dist):
            for g in sample(genome_names, nb_carriers):
                mock_genomes[g].append("AccessoryTrait_{}".format(i))

        # Keep each trait with probability completeness(genome) / 100.
        self.incompletes = {
            g: {trait for trait in traits if random() < (completeness(g) / 100)}
            for g, traits in mock_genomes.items()
        }

        # NOTE(review): the original line here was a bare `choice(core)`, which
        # raised NameError (choice is not imported) and discarded its result.
        # Assumed intent: never leave a genome empty -- confirm.
        core_pool = sorted(core)  # sample() needs a sequence, not a set
        for traits in self.incompletes.values():
            if not traits:
                traits.update(sample(core_pool, 1))

        # Number of core traits that survived in each degraded genome.
        core_counts = {
            g: sum(1 for trait in traits if trait.startswith("CoreTrait_"))
            for g, traits in self.incompletes.items()
        }
        self.mean_completeness = mean([n / core_len for n in core_counts.values()])
        self.completenesses = {g: 100 * n / core_len for g, n in core_counts.items()}
        self.mean_size = mean([len(traits) for traits in mock_genomes.values()])
        self.real_core_len = core_len

        # The parent receives a noisy completeness estimate (gaussian noise,
        # standard deviation 10); negative draws are replaced by 0.001.
        noisy_completeness = {}
        for g, true_value in self.completenesses.items():
            estimate = normal(true_value, 10)
            noisy_completeness[g] = 0.001 if estimate < 0 else estimate

        super().__init__(name=name, faas={}, cog_dict=self.incompletes,
                         checkm_dict=noisy_completeness, max_it=max_it)

        # Fraction of the true core that the parent recovered.
        self.recall = len(core.intersection(self.core)) / core_len
        # Lowest relative count among traits wrongly called core (1 if none).
        false_core_counts = [
            count for trait, count in self.cogCounts.items()
            if trait in self.core and trait not in core
        ]
        self.lowest_false = min(false_core_counts) / len(self) if false_core_counts else 1

    def mock_cog_stats(self):
        """Return per-trait statistics for every trait left in the mock genomes.

        Returns:
            dict mapping each trait name to a dict with the keys ``freq``
            (fraction of degraded genomes containing the trait), ``core``
            (whether the trait is in ``self.core``), ``type`` ("core" or
            "accessory", from the trait name prefix), ``nb_genomes``,
            ``core_len``, ``real_core_len``, ``llikelihood`` and
            ``len_accessory_genome``.
        """
        genomes = list(self.incompletes.values())
        # set().union(...) also works when there are no genomes at all.
        all_genes = set().union(*genomes)

        # Single pass over the genomes instead of one scan per trait.
        presence = defaultdict(int)
        for traits in genomes:
            for trait in traits:
                presence[trait] += 1

        nb_mock_genomes = len(genomes)
        outp = {}
        for trait in all_genes:
            outp[trait] = {
                'freq': presence[trait] / nb_mock_genomes,
                'core': trait in self.core,
                'type': "core" if trait.startswith("CoreTrait_") else "accessory",
                'nb_genomes': len(self),
                'core_len': len(self.core),
                'real_core_len': self.real_core_len,
                'llikelihood': self.likelies[trait],
                'len_accessory_genome': len(all_genes) - self.real_core_len,
            }
        return outp

    @classmethod
    def guauss_completes(cls, g, mean_completeness=60, stdev=10):
        """Return a cached gaussian completeness (in percent) for genome ``g``.

        The first call for a genome draws from ``normal(mean_completeness,
        stdev)`` until the value lies strictly between 20 and 99, and stores
        it in the module-level ``genome2guass`` cache. Later calls return the
        cached value and ignore ``mean_completeness`` and ``stdev``.
        """
        if g in genome2guass:
            return genome2guass[g]

        out_prob = normal(mean_completeness, stdev)
        # Redraw until the value falls inside the accepted (20, 99) window.
        while not (20 < out_prob < 99):
            out_prob = normal(mean_completeness, stdev)
        genome2guass[g] = out_prob
        return out_prob

    @classmethod
    def run_boots(cls):
        """Simulate mock mOTUs over a grid of parameters and collect statistics.

        The grid covers core sizes 1000-2000 (step 500), 10-235 genomes
        (step 15) and mean completeness 30-95 % (step 5).

        Returns:
            dict mapping each mock mOTU name to a dict of its parameters and
            resulting statistics.
        """
        out = {}

        for i in range(1000, 2500, 500):
            for nb in range(10, 250, 15):
                for c in range(30, 100, 5):
                    # Reset the cache in place so completeness values are
                    # redrawn for the new mean. The original assigned to an
                    # undefined `MockData`, which raised NameError.
                    genome2guass.clear()
                    mockmotu = cls(
                        "complete_{}_core_size_{}_nbgenomes_{}".format(c, i, nb),
                        i,
                        nb,
                        lambda g, c=c: cls.guauss_completes(g, mean_completeness=c, stdev=10),
                        max_it=100,
                    )
                    out[mockmotu.name] = {
                        'core_size': i,
                        'nb_genomes': nb,
                        'mean_completeness': mockmotu.mean_completeness,
                        'mean_genome_size': mockmotu.mean_size,
                        'recall': mockmotu.recall,
                        'lowest_false': mockmotu.lowest_false,
                        'accessory_genepool': mockmotu.size_accessory,
                        'mean_new_completness': mean([b.new_completness for b in mockmotu]),
                    }
        return out
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,3 @@ | ||
THREADS = 20
# Only the effective value is kept: the earlier `MIN_PROB = -10` assignment
# was immediately overwritten and therefore dead.
MIN_PROB = -1000  # log10 of prob of impossible event
DB_FOLDER = "/home/moritz/dbs/"
Oops, something went wrong.