RFCT/ENH Add fasta.py

Simpler code, adds .xz support
BigDataBiology · May 16, 2022 · 8b9d148 · 8b9d148
1 parent 305344d
commit 8b9d148
Show file tree

Hide file tree

Showing 4 changed files with 62 additions and 15 deletions.
diff --git a/ChangeLog b/ChangeLog
@@ -1,3 +1,7 @@
+Unreleased
+	* Refactor fasta parsing (add .xz support)
+	* Add IPython outputs
+
 Version 0.2.0 2020-09-03
 	* Print out command line exactly & working dir in runlog
 	* Update to newer API version

diff --git a/README.md b/README.md
@@ -95,11 +95,11 @@ the outputs is also written to output folder for convenience.
 
 ## Parameters
 
-* `-i/--input`: path to the input genome file(.fasta/.gz/.bz2).
+* `-i/--input`: path to the input genome file (FASTA, possibly .gz/.bz2/.xz compressed).
 
 * `-o/--output`: Output directory (will be created if non-existent).
 
-* `--nt-genes`: path to the input DNA gene file(.fasta/.gz/.bz2).
+* `--nt-genes`: path to the input DNA gene file (FASTA, possibly .gz/.bz2/.xz compressed).
 
-* `--aa-genes`: path to the input Protein gene file(.fasta/.gz/.bz2).
+* `--aa-genes`: path to the input Protein gene file (FASTA, possibly .gz/.bz2/.xz compressed).
 
diff --git a/gmgc_mapper/fasta.py b/gmgc_mapper/fasta.py
@@ -0,0 +1,49 @@
+def fasta_iter(fname, full_header=False):
+    '''Iterate over a (possibly gzipped) FASTA file
+
+    Parameters
+    ----------
+    fname : str
+        Filename.
+            If it ends with .gz, gzip format is assumed
+            If .bz2 then bzip2 format is assumed
+            if .xz, then lzma format is assumerd
+    full_header : boolean (optional)
+        If True, yields the full header. Otherwise (the default), only the
+        first word
+
+    Yields
+    ------
+    (h,seq): tuple of (str, str)
+    '''
+    header = None
+    chunks = []
+    if fname.endswith('.gz'):
+        import gzip
+        op = gzip.open
+    elif fname.endswith('.bz2'):
+        import bz2
+        op = bz2.open
+    elif fname.endswith('.xz'):
+        import lzma
+        op = lzma.open
+    else:
+        op = open
+    with op(fname, 'rt') as f:
+        for line in f:
+            if line[0] == '>':
+                if header is not None:
+                    yield header,''.join(chunks)
+                line = line[1:].strip()
+                if not line:
+                    header = ''
+                elif full_header:
+                    header = line.strip()
+                else:
+                    header = line.split()[0]
+                chunks = []
+            else:
+                chunks.append(line.strip())
+        if header is not None:
+            yield header, ''.join(chunks)
+
diff --git a/gmgc_mapper/main.py b/gmgc_mapper/main.py
@@ -19,6 +19,8 @@
 import yaml
 import numpy as np
 from atomicwrites import atomic_write
+
+from .fasta import fasta_iter
 from .alignment import identity_coverage
 from .gmgc_mapper_version import __version__
 
@@ -219,19 +221,11 @@ def alignment(record_dna,record_aa,hit_index,dna = False):
             gene_information.extend(gene_origin)
     return results,gene_information
 
-def gene_num(gene):
+def number_seqs_fafile(fafile):
     """
     return the number of sequence in a file(fasta, .gz , .bz2)
     """
-    if os.path.splitext(gene)[1] == '.gz':
-        with gzip.open(gene, "rt") as handle:
-            return len(list(SeqIO.parse(handle, "fasta")))
-
-    if os.path.splitext(gene)[1] == '.bz2':
-        with bz2.open(gene, "rt") as handle:
-            return len(list(SeqIO.parse(handle, "fasta")))
-
-    return len(list(SeqIO.parse(gene, "fasta")))
+    return len(_ for _ in fasta_iter(fafile))
 
 
 def query_genome_bin(hit_table):
@@ -294,8 +288,8 @@ def main(args=None):
                                        output_dir=tmpdirname + '/split_file',is_dna=True)
             else:
                 if args.nt_input is not None:
-                    n_nt = gene_num(args.nt_input)
-                    n_aa = gene_num(args.aa_input)
+                    n_nt = number_seqs_fafile(args.nt_input)
+                    n_aa = number_seqs_fafile(args.aa_input)
                     if n_nt != n_aa:
                         sys.stderr.write("Input DNA and amino acide gene files must have the same sequence number!\n")
                         sys.stderr.write(f"DNA file has {n_nt} while amino acid file has {n_aa}!")