Skip to content

Commit

Permalink
RFCT/ENH Add fasta.py
Browse files Browse the repository at this point in the history
Simpler code, adds .xz support
  • Loading branch information
luispedro committed May 16, 2022
1 parent 305344d commit 8b9d148
Show file tree
Hide file tree
Showing 4 changed files with 62 additions and 15 deletions.
4 changes: 4 additions & 0 deletions ChangeLog
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
Unreleased
* Refactor fasta parsing (add .xz support)
* Add IPython outputs

Version 0.2.0 2020-09-03
* Print out command line exactly & working dir in runlog
* Update to newer API version
Expand Down
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -95,11 +95,11 @@ the outputs is also written to output folder for convenience.

## Parameters

* `-i/--input`: path to the input genome file(.fasta/.gz/.bz2).
* `-i/--input`: path to the input genome file (FASTA, possibly .gz/.bz2/.xz compressed).

* `-o/--output`: Output directory (will be created if non-existent).

* `--nt-genes`: path to the input DNA gene file(.fasta/.gz/.bz2).
* `--nt-genes`: path to the input DNA gene file (FASTA, possibly .gz/.bz2/.xz compressed).

* `--aa-genes`: path to the input Protein gene file(.fasta/.gz/.bz2).
* `--aa-genes`: path to the input Protein gene file (FASTA, possibly .gz/.bz2/.xz compressed).

49 changes: 49 additions & 0 deletions gmgc_mapper/fasta.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
def fasta_iter(fname, full_header=False):
'''Iterate over a (possibly gzipped) FASTA file
Parameters
----------
fname : str
Filename.
If it ends with .gz, gzip format is assumed
If .bz2 then bzip2 format is assumed
if .xz, then lzma format is assumerd
full_header : boolean (optional)
If True, yields the full header. Otherwise (the default), only the
first word
Yields
------
(h,seq): tuple of (str, str)
'''
header = None
chunks = []
if fname.endswith('.gz'):
import gzip
op = gzip.open
elif fname.endswith('.bz2'):
import bz2
op = bz2.open
elif fname.endswith('.xz'):
import lzma
op = lzma.open
else:
op = open
with op(fname, 'rt') as f:
for line in f:
if line[0] == '>':
if header is not None:
yield header,''.join(chunks)
line = line[1:].strip()
if not line:
header = ''
elif full_header:
header = line.strip()
else:
header = line.split()[0]
chunks = []
else:
chunks.append(line.strip())
if header is not None:
yield header, ''.join(chunks)

18 changes: 6 additions & 12 deletions gmgc_mapper/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
import yaml
import numpy as np
from atomicwrites import atomic_write

from .fasta import fasta_iter
from .alignment import identity_coverage
from .gmgc_mapper_version import __version__

Expand Down Expand Up @@ -219,19 +221,11 @@ def alignment(record_dna,record_aa,hit_index,dna = False):
gene_information.extend(gene_origin)
return results,gene_information

def gene_num(gene):
def number_seqs_fafile(fafile):
"""
return the number of sequence in a file(fasta, .gz , .bz2)
"""
if os.path.splitext(gene)[1] == '.gz':
with gzip.open(gene, "rt") as handle:
return len(list(SeqIO.parse(handle, "fasta")))

if os.path.splitext(gene)[1] == '.bz2':
with bz2.open(gene, "rt") as handle:
return len(list(SeqIO.parse(handle, "fasta")))

return len(list(SeqIO.parse(gene, "fasta")))
return len(_ for _ in fasta_iter(fafile))


def query_genome_bin(hit_table):
Expand Down Expand Up @@ -294,8 +288,8 @@ def main(args=None):
output_dir=tmpdirname + '/split_file',is_dna=True)
else:
if args.nt_input is not None:
n_nt = gene_num(args.nt_input)
n_aa = gene_num(args.aa_input)
n_nt = number_seqs_fafile(args.nt_input)
n_aa = number_seqs_fafile(args.aa_input)
if n_nt != n_aa:
sys.stderr.write("Input DNA and amino acide gene files must have the same sequence number!\n")
sys.stderr.write(f"DNA file has {n_nt} while amino acid file has {n_aa}!")
Expand Down

0 comments on commit 8b9d148

Please sign in to comment.