From 044484c608e20fd8f4f7b6c195dd1466803be0c5 Mon Sep 17 00:00:00 2001
From: moritzbuck <moritz.buck@slu.se>
Date: Mon, 14 Jun 2021 11:00:14 +0200
Subject: [PATCH] fixed bug rounding down ANI values to the next integer

---
 mOTUlizer/__init__.py           | 2 +-
 mOTUlizer/bin/mOTUlize.py       | 5 +++--
 mOTUlizer/classes/MetaBin.py    | 4 +++-
 mOTUlizer/scripts/prochloros.py | 8 ++++++--
 4 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/mOTUlizer/__init__.py b/mOTUlizer/__init__.py
index a53db70..6b2783f 100644
--- a/mOTUlizer/__init__.py
+++ b/mOTUlizer/__init__.py
@@ -10,7 +10,7 @@
 if os.path.exists(".git"):
     label = subprocess.check_output(["git", "describe", "--tags"]).strip().decode()
 else:
-    label = "0.2.2"
+    label = "0.2.3"
 
 os.chdir(cwd)
 
diff --git a/mOTUlizer/bin/mOTUlize.py b/mOTUlizer/bin/mOTUlize.py
index 5f7dade..785f6bf 100755
--- a/mOTUlizer/bin/mOTUlize.py
+++ b/mOTUlizer/bin/mOTUlize.py
@@ -28,6 +28,7 @@
 
 """
 
+fasta_exts = [".fna", ".fa", ".fasta", ".ffn"]  # file extensions stripped from genome names read from the distance table
 
 def motulize(args):
     #parse and check your amino-acid files
@@ -72,8 +73,8 @@ def motulize(args):
             for l in handle:
                 if "query" not in l:
                     ll = l.split("\t")
-                    g1 = ".".join(os.path.basename(ll[0]).split(".")[:-1]) if ll[0].endswith(".fna") else ll[0]
-                    g2 = ".".join(os.path.basename(ll[1]).split(".")[:-1]) if ll[1].endswith(".fna") else ll[1]
+                    g1 = ".".join(os.path.basename(ll[0]).split(".")[:-1]) if ll[0].endswith(tuple(fasta_exts)) else ll[0]  # drop directory + extension only for known fasta suffixes
+                    g2 = ".".join(os.path.basename(ll[1]).split(".")[:-1]) if ll[1].endswith(tuple(fasta_exts)) else ll[1]
                     dist = float(ll[2])
                     dist_dict[(g1,g2)] = dist
     else :
diff --git a/mOTUlizer/classes/MetaBin.py b/mOTUlizer/classes/MetaBin.py
index 03acbd8..f86d6db 100644
--- a/mOTUlizer/classes/MetaBin.py
+++ b/mOTUlizer/classes/MetaBin.py
@@ -71,7 +71,7 @@ def get_anis(cls, bins, outfile = None, method = "fastANI", block_size = 500, th
                         out_tfile = tempfile.NamedTemporaryFile().name
                         call("fastANI --ql {b1} --rl {b2} -o {out} -t {threads} 2> /dev/null".format(b1 = b1_tfile, b2 = b2_tfile, out = out_tfile, threads = threads), shell = True)
                         with open(out_tfile) as handle:
-                            new_dat = ["\t".join([".".join(ll.split("/")[-1].split(".")[:-1]) if "." in ll else ll for ll in l.split()]) +"\n" for l in handle.readlines()]
+                            new_dat = ["\t".join(l.split()) + "\n" for l in handle]  # re-emit each row tab-separated, names left untouched
                         with open(fastani_file, "a") as handle:
                             handle.writelines(new_dat)
 
@@ -82,6 +82,8 @@ def get_anis(cls, bins, outfile = None, method = "fastANI", block_size = 500, th
             with open(fastani_file) as handle:
                 handle.readline()
                 out_dists = {(l.split()[0], l.strip().split()[1]) : float(l.split()[2]) for l in handle}
+#                tfile = lambda k : ".".join(k.split(".")[:-1]) if (k.endswith(".fna") or k.endswith(".fa") or k.endswith(".fasta") or k.endswith(".fna") or k.endswith(".ffn")) else k
+#                out_dists = {(tfile(k[0]),tfile(k[1])) : v for k,v in out_dists.items()}
             if outfile is None:
                 os.remove(fastani_file)
         else :
diff --git a/mOTUlizer/scripts/prochloros.py b/mOTUlizer/scripts/prochloros.py
index f4bde54..1649c0c 100644
--- a/mOTUlizer/scripts/prochloros.py
+++ b/mOTUlizer/scripts/prochloros.py
@@ -28,6 +28,9 @@
 stratfresh = pandas.read_csv("/home/moritz/data/data_submit/metadata/master_table.csv", index_col=0)
 #stratfresh_motus = pandas.read_csv("/home/moritz/data/data_submit/metadata/Supplementary_Table_S7_-_Data_about_mOTUs.csv", index_col=0)
 #stratfresh_motus["est_size"] = list(100*stratfresh.loc[stratfresh_motus.representative_MAGs].length/stratfresh.loc[stratfresh_motus.representative_MAGs].completeness/1000000)
+with open("stratfreshmotus.json") as handle :
+    stratfresh_motus = json.load(handle)
+
 with open("stratfreshmotus.json") as handle :
     bin2stratfreshmotu = { g['name'] : k for k,v in tqdm(json.load(handle).items()) for g in v['genomes']  }
 
@@ -185,7 +188,7 @@ def process_species_cores():
             scaffold_count = stratfresh.loc[tliss].nb_contigs if type =="stratfreshdb" else r95.loc[tliss].scaffold_count
             new_comps_roary = {gid.replace("RS_","").replace("GB_","") : len(v['roary_cogs'][gid.replace("RS_","").replace("GB_","")].intersection(v['motupan_roary']))/len(v['motupan_roary'])  for gid in v['gids']}
             new_comps_ppan = {gid.replace("RS_","").replace("GB_","") : len(v['ppan_cogs'][gid.replace("RS_","").replace("GB_","")].intersection(v['motupan']))/len(v['motupan'])  for gid in v['gids']}
-            goods = set(goods)
+#            goods = set(goods)
             core_stats[k] = {
             'taxo' : k if type == "gtdb" else "tbd", #stratfresh_motus.loc[k].consensus_tax,
             'motupan_w_ppan' : len(v['motupan']),
@@ -201,7 +204,8 @@ def process_species_cores():
             'mean_est_roary_cogs' :  mean([ len(v['roary_cogs'][k])/c  for k,c in new_comps_ppan.items() if c > 0.4] ),
             'mean_scaff_count' : mean(scaffold_count),
             'type' : type,
-            'est_size' :  r95.loc[gtdb2rep[k]].est_size if type == "gtdb" else  -1 #stratfresh_motus.loc[k].est_size
+            'est_size' :  r95.loc[gtdb2rep[k]].est_size if type == "gtdb" else  stratfresh_motus.loc[k].est_size,  # NOTE(review): stratfresh_motus comes from json.load() above, so .loc may not exist on it — confirm
+            'genoms_ids' : ";".join(v['gids']),  # semicolon-separated ids, same v['gids'] iterated for the completeness dicts
             }