add user protein tests #76

oschwengers · Sep 9, 2021 · 1541f10 · 1541f10
1 parent 992d929
commit 1541f10
Show file tree

Hide file tree

Showing 4 changed files with 198 additions and 3 deletions.
diff --git a/test/data/user-proteins.faa b/test/data/user-proteins.faa
@@ -0,0 +1,4 @@
+>VFDB_test 90~~~90~~~90~~~yaxA~~~cytotoxin YaxA~~~VFDB:VFG045347,VFDB:VF0511
+MTQTQLAIDNVLASAENTIQLNELPKVVLDFITGEQTSVARSGGIFTKEDLINLKLYVRKGLSLPTRQDEVEAYLGYKKIDVAGLEPKDIKLLFDEIHNHALNWNDVEQAVLQQSLDLDIAAKNIISTGNEIINLINQMPITLRVKTLLRDITDKQLENITYESADHEVASALKDILDDMKGDINRHQTTTENVRKKVSDYRITLTGGELSSGDKVNGLEPQVKTKYDLMEKSNMRKSIKELDEKIKEKKQRIEQLKKDYDKFVGLSFTGAIGGIIAMAITSGIFGAKAENARKEKNALISEVAELESKVSSQRALQTALEALSLSFSDIGIRMVDAESALNHLDFMWLSVLNQITESQIQFAMINNALRLTSFVNKFQQVITPWQSVGDSARQLVDIFDEAIKEYKKVYG
+>hypo-mock-test 99~~~99~~~99~~~mock1~~~mock hypothetical user protein 1~~~USERDB:MOCK1
+MAQNPFKALNINIDKIESALTQNGVTNYSSNVKNERETHISGTYKGIDFLIKLMPSGGNTTIGRASGQNNTYFDEIALIIKENCLYSDTKNFEYTIPKFSDDDRANLFEFLSEEGITITEDNNNDPNCKHQYIMTTSNGDRVRAKIYKRGSIQFQGKYLQIASLINDFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKKQLSCSLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNLGEYFTENKPKYIIREIHQETINGEIAEVLCECYTYWHENRHGLFHMKPGIADTKTINKLESIAIIDTVCQLIDGGVARLKL
diff --git a/test/test_args.py b/test/test_args.py
@@ -139,6 +139,34 @@ def test_replicons_ok(tmpdir):
         assert Path.exists(tmpdir_path.joinpath(file))
 
 
+@pytest.mark.parametrize(
+    'parameters',
+    [
+        ['--proteins'],  # flag without a value
+        ['--proteins', ''],  # empty value
+        ['--proteins', 'foo'],  # path that does not exist
+    ]
+)
+def test_proteins_failiing(parameters, tmpdir):
+    # Every malformed '--proteins' argument must make bakta exit with a
+    # non-zero return code.
+    # NOTE(review): 'failiing' looks like a typo for 'failing'; the name is
+    # kept so existing test selections keep working.
+    cmd_line = [
+        'bin/bakta', '--db', 'test/db', '--output', tmpdir,
+        *parameters,
+        'test/data/NC_002127.1.fna',
+    ]
+    proc = run(cmd_line)
+    assert proc.returncode != 0
+
+
+@pytest.mark.slow
+def test_proteins_ok(tmpdir):
+    # A valid user proteins file is accepted: bakta exits with 0 and every
+    # file named in FILES is present in the output directory.
+    cmd_line = [
+        'bin/bakta', '--db', 'test/db', '--output', tmpdir, '--prefix', 'test',
+        '--proteins', 'test/data/user-proteins.faa',
+        *SKIP_PARAMETERS,
+        'test/data/NC_002127.1.fna',
+    ]
+    proc = run(cmd_line)
+    assert proc.returncode == 0
+
+    output_dir = Path(tmpdir)
+    for file_name in FILES:
+        assert output_dir.joinpath(file_name).exists()
+
+
 def test_output_failing():
     # test database arguments
     cmd_line = ['bin/bakta', '--output', '/', 'test/data/draft-w-plasmids.fna']

diff --git a/test/test_bakta.py b/test/test_bakta.py
@@ -9,7 +9,7 @@
 @pytest.mark.slow
 def test_bakta_mock_skipped_features(tmpdir):
     # fast test skipping all feature detections
-    proc = run(['bin/bakta', '--db', 'test/db', '--output', tmpdir, '--prefix', 'test'] + SKIP_PARAMETERS + ['test/data/NC_002127.1.fna'])
+    proc = run(['bin/bakta', '--db', 'test/db', '--output', tmpdir, '--prefix', 'test', '--proteins', 'test/data/user-proteins.faa'] + SKIP_PARAMETERS + ['test/data/NC_002127.1.fna'])
     assert proc.returncode == 0
 
     tmpdir_path = Path(tmpdir)
@@ -20,7 +20,7 @@ def test_bakta_mock_skipped_features(tmpdir):
 @pytest.mark.slow
 def test_bakta_plasmid(tmpdir):
     # full test on plasmid
-    proc = run(['bin/bakta', '--db', 'test/db', '--verbose', '--output', tmpdir, '--prefix', 'test', '--complete', 'test/data/NC_002127.1.fna'])
+    proc = run(['bin/bakta', '--db', 'test/db', '--verbose', '--output', tmpdir, '--prefix', 'test', '--complete', '--proteins', 'test/data/user-proteins.faa', 'test/data/NC_002127.1.fna'])
     assert proc.returncode == 0
 
     tmpdir_path = Path(tmpdir)
@@ -52,7 +52,7 @@ def test_bakta_plasmid(tmpdir):
 @pytest.mark.slow
 def test_bakta_genome(tmpdir):
     # full test on complete genome in compliant mode
-    proc = run(['bin/bakta', '--db', 'test/db', '--verbose', '--output', tmpdir, '--prefix', 'test', '--complete', '--compliant', 'test/data/GCF_000008865.2.fna.gz'])
+    proc = run(['bin/bakta', '--db', 'test/db', '--verbose', '--output', tmpdir, '--prefix', 'test', '--complete', '--compliant', '--proteins', 'test/data/user-proteins.faa', 'test/data/GCF_000008865.2.fna.gz'])
     assert proc.returncode == 0
 
     tmpdir_path = Path(tmpdir)

diff --git a/test/test_user_proteins.py b/test/test_user_proteins.py
@@ -0,0 +1,163 @@
+import json
+
+from pathlib import Path
+from subprocess import run
+
+import bakta.config as cfg
+import bakta.expert.protein_sequences as exp_aa_seq
+
+import pytest
+
+
+# Amino acid sequence shared by every mock user protein record in this module.
+SEQUENCE = 'MRADEEPGDLSAVAQDYLKVIWTAQEWSQDKVSTKMLAERIGVSASTASESIRKLAEQGLVDHEKYGAVTLTDSGRRAALAMVRRHRLLETFLVNELGYRWDEVHDEA'
+
+
+# Records that test_user_proteins_io expects to be processed without an exit.
+# write_tmp_faa() turns each record into one FASTA entry with the header
+# '>{id} {description}'; the description holds '~~~'-separated fields.
+#
+# Short form (three fields): gene~~~product~~~dbxrefs
+# Long form (six fields): three leading numeric fields followed by the short
+# form. The numeric fields are presumably min identity, min query coverage and
+# min model coverage (inferred from the ids of aa_wrong_7..9) - confirm against
+# the parser in bakta.expert.protein_sequences.
+
+# Short form with empty gene and empty dbxrefs; only the product is given.
+aa_min = {
+    'id': 'min',
+    'description': '~~~product~~~',
+    'sequence': SEQUENCE
+}
+# Short form with gene and product, empty dbxrefs.
+aa_min_gene = {
+    'id': 'min',
+    'description': 'gene~~~product~~~',
+    'sequence': SEQUENCE
+}
+# Short form with a single 'db:id' dbxref.
+aa_min_dbxref = {
+    'id': 'min',
+    'description': 'gene~~~product~~~db-1:id-1',
+    'sequence': SEQUENCE
+}
+# Short form with two comma-separated dbxrefs.
+aa_min_dbxrefs = {
+    'id': 'min',
+    'description': 'gene~~~product~~~db-1:id-1,db-2:id-2',
+    'sequence': SEQUENCE
+}
+# Long form with all six fields populated.
+aa_full = {
+    'id': 'full',
+    'description': '90.0~~~80.0~~~80.0~~~gene~~~product~~~db-1:id-1,db-2:id-2',
+    'sequence': SEQUENCE
+}
+
+
+# Records that test_wrong_user_proteins_io expects to end in SystemExit.
+# Each description deviates in one way from the accepted short form
+# (gene~~~product~~~dbxrefs) or long form (three numeric fields + short form).
+
+# Only one '~~~' separator, i.e. too few fields.
+aa_wrong_1 = {
+    'id': 'low-cols',
+    'description': '~~~product',
+    'sequence': SEQUENCE
+}
+# Trailing '~~~' after the dbxref, i.e. one separator too many.
+aa_wrong_2 = {
+    'id': 'high-cols',
+    'description': '90~~~80~~~80~~~gene~~~product~~~dbxref:dbxref~~~',
+    'sequence': SEQUENCE
+}
+# Short form with an empty product field.
+aa_wrong_3 = {
+    'id': 'no-product',
+    'description': 'gene~~~~~~dbxref:dbxref',
+    'sequence': SEQUENCE
+}
+# Long form with an empty product field.
+aa_wrong_4 = {
+    'id': 'no-product-full',
+    'description': '90~~~80~~~80~~~gene~~~~~~dbxref:dbxref',
+    'sequence': SEQUENCE
+}
+# Short form whose dbxref lacks the ':' between database and identifier.
+aa_wrong_5 = {
+    'id': 'wrong-dbxref',
+    'description': 'gene~~~product~~~dbxrefdbxref',
+    'sequence': SEQUENCE
+}
+# Long form whose dbxref lacks the ':' between database and identifier.
+aa_wrong_6 = {
+    'id': 'wrong-dbxref-full',
+    'description': '90~~~80~~~80~~~gene~~~product~~~dbxrefdbxref',
+    'sequence': SEQUENCE
+}
+# Long form with a non-numeric first field.
+# NOTE(review): the id 'wrong-id' presumably refers to the min identity
+# threshold (compare 'wrong-min-query-cov' / 'wrong-min-model-cov' below),
+# not to the sequence id - confirm.
+aa_wrong_7 = {
+    'id': 'wrong-id',
+    'description': 'ninety~~~80~~~80~~~gene~~~product~~~dbxref:dbxref',
+    'sequence': SEQUENCE
+}
+# Long form with a non-numeric second field.
+aa_wrong_8 = {
+    'id': 'wrong-min-query-cov',
+    'description': '90~~~eighty~~~80~~~gene~~~product~~~dbxref:dbxref',
+    'sequence': SEQUENCE
+}
+# Long form with a non-numeric third field.
+aa_wrong_9 = {
+    'id': 'wrong-min-model-cov',
+    'description': '90~~~80~~~eighty~~~gene~~~product~~~dbxref:dbxref',
+    'sequence': SEQUENCE
+}
+
+
+@pytest.mark.parametrize(
+    'aa',
+    [
+        aa_wrong_1,
+        aa_wrong_2,
+        aa_wrong_3,
+        aa_wrong_4,
+        aa_wrong_5,
+        aa_wrong_6,
+        aa_wrong_7,
+        aa_wrong_8,
+        aa_wrong_9,
+    ]
+)
+def test_wrong_user_proteins_io(tmpdir, aa):
+    # Each malformed record is written to a temporary FASTA file that is
+    # registered as cfg.user_proteins; write_user_protein_sequences() must
+    # then terminate via SystemExit.
+    work_dir = Path(tmpdir)
+    cfg.user_proteins = work_dir / 'user.faa'
+    write_tmp_faa(aa, cfg.user_proteins)
+
+    cleaned_path = work_dir / 'user-clean.faa'
+    with pytest.raises(SystemExit) as pytest_wrapped_e:
+        exp_aa_seq.write_user_protein_sequences(cleaned_path)
+    assert pytest_wrapped_e.type is SystemExit
+
+
+@pytest.mark.parametrize(
+    'aa',
+    [
+        aa_min,
+        aa_min_gene,
+        aa_min_dbxref,
+        aa_min_dbxrefs,
+        aa_full,
+    ]
+)
+def test_user_proteins_io(tmpdir, aa):
+    # Each well-formed record is written to a temporary FASTA file that is
+    # registered as cfg.user_proteins; write_user_protein_sequences() must
+    # run through without raising or exiting.
+    work_dir = Path(tmpdir)
+    cfg.user_proteins = work_dir / 'user.faa'
+    write_tmp_faa(aa, cfg.user_proteins)
+
+    cleaned_path = work_dir / 'user-clean.faa'
+    exp_aa_seq.write_user_protein_sequences(cleaned_path)
+
+
+def write_tmp_faa(aa, aa_path):
+    # Write the record as a single FASTA entry to aa_path (overwriting it):
+    # a '>{id} {description}' header line followed by the sequence line.
+    with aa_path.open('w') as fasta_fh:
+        print(f">{aa['id']} {aa['description']}", file=fasta_fh)
+        print(aa['sequence'], file=fasta_fh)
+
+
+@pytest.mark.slow
+def test_user_proteins(tmpdir):
+    # End-to-end check of the user proteins workflow: run bakta with the mock
+    # user proteins file, then expect exactly one feature in the JSON result
+    # whose 'expert' section holds a 'user_proteins' entry.
+    # The listed '--skip-*' flags switch off the named detections to keep the
+    # run short; no CDS skip flag is passed here.
+    proc = run(
+        [
+            'bin/bakta', '--db', 'test/db', '--output', tmpdir, '--prefix', 'test', '--proteins', 'test/data/user-proteins.faa',
+            '--skip-tmrna', '--skip-trna', '--skip-rrna', '--skip-ncrna', '--skip-ncrna-region', '--skip-crispr', '--skip-sorf', '--skip-ori', '--skip-gap',
+            'test/data/NC_002127.1.fna'
+        ]
+    )
+    assert proc.returncode == 0
+
+    # '--prefix test' above is why the result file is named 'test.json'
+    results_path = Path(tmpdir).joinpath('test.json')
+    assert results_path.exists()
+    with results_path.open() as fh:
+        results = json.load(fh)
+
+    user_prot_feats = [
+        feat for feat in results['features']
+        if 'expert' in feat and 'user_proteins' in feat['expert']
+    ]
+    assert len(user_prot_feats) == 1