Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Output GFF files from eggnog-diamond-search and eggnog-hmmer-search #221

Open
wants to merge 16 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 22 additions & 4 deletions q2_moshpit/eggnog/orthologs/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@

from q2_types.feature_data import FeatureData
from q2_types.feature_data_mag import MAG, MAGSequencesDirFmt
from q2_types.genome_data import SeedOrthologDirFmt, OrthologFileFmt
from q2_types.genome_data import (SeedOrthologDirFmt, OrthologFileFmt,
LociDirectoryFormat)
from q2_types.per_sample_sequences import (
Contigs, MAGs, ContigSequencesDirFmt, MultiMAGSequencesDirFmt
)
Expand Down Expand Up @@ -86,16 +87,20 @@ def _run_eggnog_search_pipeline(
_eggnog_search = ctx.get_action("moshpit", search_action)
collate_hits = ctx.get_action("types", "collate_orthologs")
_eggnog_feature_table = ctx.get_action("moshpit", "_eggnog_feature_table")
collate_loci = ctx.get_action("types", "collate_loci")
(partitioned_sequences,) = partition_method(sequences, num_partitions)

hits = []
loci = []
for seq in partitioned_sequences.values():
(hit, _) = _eggnog_search(seq, *db, num_cpus, db_in_memory)
(hit, _, loci_dir) = _eggnog_search(seq, *db, num_cpus, db_in_memory)
hits.append(hit)
loci.append(loci_dir)

(collated_hits,) = collate_hits(hits)
(collated_tables,) = _eggnog_feature_table(collated_hits)
return collated_hits, collated_tables
(collated_loci,) = collate_loci(loci)
return collated_hits, collated_tables, collated_loci


def _search_runner(
Expand Down Expand Up @@ -152,6 +157,19 @@ def _eggnog_search(
for mag_id, mag_fp in mags.items():
search_runner(input_path=mag_fp, sample_label=mag_id)

# collect the '*.emapper.genepred.gff' files written to output_loc and
# duplicate each one into the loci directory with a shortened '.gff' suffix
loci_dir = LociDirectoryFormat()
gff_fp = [
os.path.basename(x) for x
in glob.glob(f'{output_loc}/*.emapper.genepred.gff')
]
for fn in gff_fp:
new_fn = fn.replace('.emapper.genepred.gff', '.gff')
qiime2.util.duplicate(
os.path.join(output_loc, fn),
os.path.join(loci_dir.path, new_fn)
)

result = SeedOrthologDirFmt()
ortholog_fps = [
os.path.basename(x) for x
Expand All @@ -164,7 +182,7 @@ def _eggnog_search(
)

ft = _eggnog_feature_table(result)
return result, ft
return result, ft, loci_dir


def _eggnog_feature_table(seed_orthologs: SeedOrthologDirFmt) -> pd.DataFrame:
Expand Down
15 changes: 8 additions & 7 deletions q2_moshpit/eggnog/orthologs/diamond.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
MAGSequencesDirFmt
)
from q2_types.genome_data import (
SeedOrthologDirFmt
SeedOrthologDirFmt, LociDirectoryFormat
)
from q2_types.per_sample_sequences import (
ContigSequencesDirFmt, MultiMAGSequencesDirFmt
Expand All @@ -35,25 +35,26 @@ def _eggnog_diamond_search(
],
diamond_db: DiamondDatabaseDirFmt,
num_cpus: int = 1,
db_in_memory: bool = False
) -> (SeedOrthologDirFmt, pd.DataFrame):
db_in_memory: bool = False,
) -> (SeedOrthologDirFmt, pd.DataFrame, LociDirectoryFormat):
with tempfile.TemporaryDirectory() as output_loc:
db_fp = os.path.join(str(diamond_db), 'ref_db.dmnd')
search_runner = partial(
_search_runner, output_loc=str(output_loc),
num_cpus=num_cpus, db_in_memory=db_in_memory,
runner_args=['diamond', '--dmnd_db', str(db_fp)]
)
result, ft = _eggnog_search(sequences, search_runner, str(output_loc))
return result, ft
result, ft, loci = _eggnog_search(sequences, search_runner,
str(output_loc))
return result, ft, loci


def eggnog_diamond_search(
ctx, sequences, diamond_db,
num_cpus=1, db_in_memory=False, num_partitions=None
):
collated_hits, collated_tables = _run_eggnog_search_pipeline(
collated_hits, collated_tables, loci = _run_eggnog_search_pipeline(
ctx, sequences, [diamond_db], num_cpus, db_in_memory, num_partitions,
"_eggnog_diamond_search"
)
return collated_hits, collated_tables
return collated_hits, collated_tables, loci
25 changes: 13 additions & 12 deletions q2_moshpit/eggnog/orthologs/hmmer.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from q2_moshpit.eggnog.types import EggnogHmmerIdmapDirectoryFmt
from q2_types.feature_data_mag import MAGSequencesDirFmt
from q2_types.genome_data import (
ProteinsDirectoryFormat, SeedOrthologDirFmt
ProteinsDirectoryFormat, SeedOrthologDirFmt, LociDirectoryFormat
)
from q2_types.per_sample_sequences import (
ContigSequencesDirFmt, MultiMAGSequencesDirFmt
Expand All @@ -38,7 +38,7 @@ def _eggnog_hmmer_search(
seed_alignments: ProteinsDirectoryFormat,
num_cpus: int = 1,
db_in_memory: bool = False
) -> (SeedOrthologDirFmt, pd.DataFrame):
) -> (SeedOrthologDirFmt, pd.DataFrame, LociDirectoryFormat):
with tempfile.TemporaryDirectory() as output_loc:
taxon_id = os.listdir(idmap.path)[0].split(".")[0]
tmp_subdir = f"{output_loc}/hmmer/{taxon_id}"
Expand All @@ -54,17 +54,18 @@ def _eggnog_hmmer_search(
'--genepred', 'prodigal' # default incompatible with HMMER
]
)
result, ft = _eggnog_search(sequences, search_runner, output_loc)
return result, ft
result, ft, loci = _eggnog_search(sequences, search_runner, output_loc)
return result, ft, loci


def eggnog_hmmer_search(
ctx, sequences, pressed_hmm_db, idmap, seed_alignments,
num_cpus=1, db_in_memory=False, num_partitions=None
ctx, sequences, pressed_hmm_db, idmap, seed_alignments,
num_cpus=1, db_in_memory=False, num_partitions=None
):
collated_hits, collated_tables = _run_eggnog_search_pipeline(
ctx, sequences, [idmap, pressed_hmm_db, seed_alignments],
num_cpus, db_in_memory, num_partitions,
"_eggnog_hmmer_search"
)
return collated_hits, collated_tables
collated_hits, collated_tables, collated_loci = (
_run_eggnog_search_pipeline(
ctx, sequences, [idmap, pressed_hmm_db, seed_alignments],
num_cpus, db_in_memory, num_partitions,
"_eggnog_hmmer_search"
))
return collated_hits, collated_tables, collated_loci
36 changes: 19 additions & 17 deletions q2_moshpit/eggnog/tests/test_orthologs.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,9 +73,11 @@ def setUp(self):
def test_eggnog_hmmer_search_pipeline(self):
mock_action = MagicMock(side_effect=[
lambda sequences, num_partitions: ({"mag1": {}, "mag2": {}},),
lambda seq, pressed, idmap, fastas, num_cpus, db_in_memory: (0, 0),
lambda seq, pressed, idmap, fastas, num_cpus, db_in_memory:
(0, 0, 0),
lambda hits: ("collated_hits",),
lambda collated_hits: ("collated_tables",),
lambda collated_loci: ("collated_loci",),
])
mock_ctx = MagicMock(get_action=mock_action)
obs = eggnog_hmmer_search(
Expand All @@ -85,7 +87,7 @@ def test_eggnog_hmmer_search_pipeline(self):
idmap=self.idmap_artifact,
seed_alignments=self.fastas_artifact
)
exp = ("collated_hits", "collated_tables")
exp = ("collated_hits", "collated_tables", "collated_loci")
self.assertTupleEqual(obs, exp)

def test_symlink_files_to_target_dir(self):
Expand Down Expand Up @@ -114,8 +116,8 @@ def test_eggnog_hmmer_search(
self, mock_eggnog_search, mock_symlink, mock_tmpdir, mock_makedirs
):
mock_tmpdir.return_value.__enter__.return_value = "tmp"
mock_eggnog_search.return_value = (0, 1)
result, ft = _eggnog_hmmer_search(
mock_eggnog_search.return_value = (0, 1, 2)
result, ft, loci = _eggnog_hmmer_search(
sequences=self.mags,
idmap=self.idmap,
pressed_hmm_db=self.pressed_hmm,
Expand All @@ -129,7 +131,7 @@ def test_eggnog_hmmer_search(
ANY, # partial() method not patchable or comparable
"tmp"
)
self.assertTupleEqual((result, ft), (0, 1))
self.assertTupleEqual((result, ft, loci), (0, 1, 2))

def test_eggnog_search_mags(self):
sequences = MultiMAGSequencesDirFmt(
Expand All @@ -138,7 +140,7 @@ def test_eggnog_search_mags(self):
output_loc = self.get_data_path('hits')
search_runner = MagicMock()

result, ft = _eggnog_search(sequences, search_runner, output_loc)
result, ft, _ = _eggnog_search(sequences, search_runner, output_loc)
result.validate()
self.assertIsInstance(ft, pd.DataFrame)

Expand All @@ -155,7 +157,7 @@ def test_eggnog_search_contigs(self):
output_loc = self.get_data_path('hits')
search_runner = MagicMock()

result, ft = _eggnog_search(sequences, search_runner, output_loc)
result, ft, _ = _eggnog_search(sequences, search_runner, output_loc)
result.validate()
self.assertIsInstance(ft, pd.DataFrame)

Expand All @@ -171,7 +173,7 @@ def test_eggnog_search_mags_derep(self):
output_loc = self.get_data_path('hits')
search_runner = MagicMock()

result, ft = _eggnog_search(sequences, search_runner, output_loc)
result, ft, _ = _eggnog_search(sequences, search_runner, output_loc)
result.validate()
self.assertIsInstance(ft, pd.DataFrame)

Expand Down Expand Up @@ -218,7 +220,7 @@ def test_good_small_search_contigs(self):
self.get_data_path('contig-sequences-1')
).view(ContigSequencesDirFmt)

_, obs = _eggnog_diamond_search(
_, obs, _ = _eggnog_diamond_search(
sequences=contigs,
diamond_db=self.diamond_db
)
Expand All @@ -234,7 +236,7 @@ def test_good_small_search_mags_derep(self):
self.get_data_path('mag-sequences')
).view(MAGSequencesDirFmt)

_, obs = _eggnog_diamond_search(
_, obs, _ = _eggnog_diamond_search(
sequences=mags,
diamond_db=self.diamond_db
)
Expand All @@ -253,7 +255,7 @@ def test_good_small_search_mags(self):
self.get_data_path('mag-sequences-per-sample')
).view(MultiMAGSequencesDirFmt)

_, obs = _eggnog_diamond_search(
_, obs, _ = _eggnog_diamond_search(
sequences=mags,
diamond_db=self.diamond_db
)
Expand All @@ -279,12 +281,12 @@ def test_eggnog_search_parallel_contigs(self):
)

with self.test_config:
_, parallel = self.eggnog_diamond_search.parallel(
_, parallel, _ = self.eggnog_diamond_search.parallel(
contigs,
self.diamond_db_artifact
)._result()

_, single = self._eggnog_diamond_search(
_, single, _ = self._eggnog_diamond_search(
sequences=contigs,
diamond_db=self.diamond_db_artifact
)
Expand All @@ -301,12 +303,12 @@ def test_eggnog_search_parallel_mags_derep(self):
)

with self.test_config:
_, parallel = self.eggnog_diamond_search.parallel(
_, parallel, _ = self.eggnog_diamond_search.parallel(
mags,
self.diamond_db_artifact
)._result()

_, single = self._eggnog_diamond_search(
_, single, _ = self._eggnog_diamond_search(
sequences=mags,
diamond_db=self.diamond_db_artifact
)
Expand All @@ -323,12 +325,12 @@ def test_eggnog_search_parallel_mags(self):
)

with self.test_config:
_, parallel = self.eggnog_diamond_search.parallel(
_, parallel, _ = self.eggnog_diamond_search.parallel(
mags,
self.diamond_db_artifact
)._result()

_, single = self._eggnog_diamond_search(
_, single, _ = self._eggnog_diamond_search(
sequences=mags,
diamond_db=self.diamond_db_artifact
)
Expand Down
22 changes: 14 additions & 8 deletions q2_moshpit/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -708,7 +708,8 @@
},
outputs=[
('eggnog_hits', SampleData[Orthologs]),
('table', FeatureTable[Frequency])
('table', FeatureTable[Frequency]),
('loci', GenomeData[Loci])
],
name='Run eggNOG search using diamond aligner',
description="This method performs the steps by which we find our "
Expand Down Expand Up @@ -752,7 +753,8 @@
},
outputs=[
('eggnog_hits', SampleData[Orthologs]),
('table', FeatureTable[Frequency])
('table', FeatureTable[Frequency]),
('loci', GenomeData[Loci]),
],
name='Run eggNOG search using HMMER aligner',
description="This method uses HMMER to find possible target sequences "
Expand All @@ -772,11 +774,11 @@
},
parameters={
'num_cpus': Int,
'db_in_memory': Bool,
'db_in_memory': Bool
},
input_descriptions={
'sequences': 'Sequences to be searched for ortholog hits.',
'diamond_db': 'Diamond database.',
'diamond_db': 'Diamond database.'
},
parameter_descriptions={
'num_cpus': 'Number of CPUs to utilize. \'0\' will '
Expand All @@ -788,12 +790,14 @@
},
outputs=[
('eggnog_hits', SampleData[Orthologs]),
('table', FeatureTable[Frequency])
('table', FeatureTable[Frequency]),
('loci', GenomeData[Loci])
],
output_descriptions={
'eggnog_hits': 'BLAST6-like table(s) describing the identified '
'orthologs. One table per sample or MAG in the input.',
'table': 'Feature table with counts of orthologs per sample/MAG.'
'table': 'Feature table with counts of orthologs per sample/MAG.',
'loci': 'Loci of the identified orthologs.'
},
name='Run eggNOG search using Diamond aligner',
description="This method performs the steps by which we find our "
Expand Down Expand Up @@ -838,12 +842,14 @@
},
outputs=[
('eggnog_hits', SampleData[Orthologs]),
('table', FeatureTable[Frequency])
('table', FeatureTable[Frequency]),
('loci', GenomeData[Loci])
],
output_descriptions={
'eggnog_hits': 'BLAST6-like table(s) describing the identified '
'orthologs. One table per sample or MAG in the input.',
'table': 'Feature table with counts of orthologs per sample/MAG.'
'table': 'Feature table with counts of orthologs per sample/MAG.',
'loci': 'Loci of the identified orthologs.'
},
name='Run eggNOG search using HMMER aligner',
description='This method performs the steps by which we find our '
Expand Down
Loading