Skip to content

Commit

Permalink
relax locus-tag rules #92
Browse files Browse the repository at this point in the history
  • Loading branch information
oschwengers committed Jan 27, 2022
1 parent f526d10 commit e2651c3
Show file tree
Hide file tree
Showing 3 changed files with 90 additions and 15 deletions.
23 changes: 14 additions & 9 deletions bakta/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,11 @@ def setup(args):
log.info('translation_table=%s', translation_table)
gram = args.gram
log.info('gram=%s', gram)
compliant = args.compliant
log.info('compliant=%s', compliant)
if(compliant):
min_contig_length = 200
log.info('compliant mode! min_contig_length=%s', min_contig_length)
locus = args.locus
if(locus is not None):
if(locus == ''):
Expand All @@ -214,7 +219,7 @@ def setup(args):
sys.exit(f"ERROR: whitespace character ({locus}) in 'locus' parameter!")
if(bc.RE_INSDC_ID_PREFIX.fullmatch(locus) is None):
log.error("Invalid 'locus' parameter! locus=%s", locus)
sys.exit(f"ERROR: invalid 'locus' parameter ({locus})!\nLocus prefixes must contain between 1 and 20 alphanumeric and '-_' characters.")
sys.exit(f"ERROR: invalid 'locus' parameter ({locus})!\nLocus prefixes must contain between 1 and 20 alphanumeric or '-_' characters.")
log.info('locus=%s', locus)
locus_tag = args.locus_tag
if(locus_tag is not None):
Expand All @@ -224,9 +229,14 @@ def setup(args):
if(' ' in locus_tag):
log.error("Whitespace character in 'locus-tag' parameter! locus-tag=%s", locus_tag)
sys.exit(f"ERROR: whitespace character ({locus_tag}) in 'locus-tag' parameter!")
if(bc.RE_INSDC_LOCUSTAG_PREFIX.fullmatch(locus_tag) is None):
log.error("Invalid 'locus-tag' parameter! locus-tag=%s", locus_tag)
sys.exit(f"ERROR: invalid 'locus-tag' parameter ({locus_tag})!\nLocus tag prefixes must contain between 3 and 12 alphanumeric uppercase characters and start with a letter.")
if(compliant):
if(bc.RE_INSDC_LOCUSTAG_PREFIX.fullmatch(locus_tag) is None):
log.error("INSDC-incompliant 'locus-tag' parameter! locus-tag=%s", locus_tag)
sys.exit(f"ERROR: INSDC-incompliant 'locus-tag' parameter ({locus_tag})!\nINSDC Locus tag prefixes must contain between 3 and 12 alphanumeric uppercase characters and start with a letter.")
else:
if(bc.RE_LOCUSTAG_PREFIX.fullmatch(locus_tag) is None):
log.error("Invalid 'locus-tag' parameter! locus-tag=%s", locus_tag)
sys.exit(f"ERROR: invalid 'locus-tag' parameter ({locus_tag})!\nLocus tag prefixes must contain between 1 and 24 alphanumeric characters or '_.-' signs.")
log.info('locus-tag=%s', locus_tag)
keep_contig_headers = args.keep_contig_headers
log.info('keep_contig_headers=%s', keep_contig_headers)
Expand All @@ -243,11 +253,6 @@ def setup(args):
log.error('provided replicon file not valid! path=%s', replicons)
sys.exit(f'ERROR: replicon table file ({replicons}) not valid!')
log.info('replicon-table=%s', replicons)
compliant = args.compliant
log.info('compliant=%s', compliant)
if(compliant):
min_contig_length = 200
log.info('compliant mode! min_contig_length=%s', min_contig_length)
user_proteins = args.proteins
if(user_proteins is not None):
try:
Expand Down
1 change: 1 addition & 0 deletions bakta/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
# One or more consecutive whitespace characters.
RE_MULTIWHITESPACE = re.compile(r'\s+')
# Sequence ID prefix: 1-20 characters drawn from ASCII letters, digits and '_.:*#-'.
RE_INSDC_ID_PREFIX = re.compile(r'[A-Za-z\d_.:*#-]{1,20}') # https://www.ncbi.nlm.nih.gov/WebSub/html/help/fasta.html
# Sequence ID: same character set as the prefix pattern, 1-25 characters.
RE_INSDC_ID = re.compile(r'[A-Za-z\d_.:*#-]{1,25}') # https://www.ncbi.nlm.nih.gov/WebSub/html/help/fasta.html
# Relaxed locus tag prefix: 1-24 characters drawn from ASCII letters, digits and '_.-'.
RE_LOCUSTAG_PREFIX = re.compile(r'[A-Za-z\d_.-]{1,24}') # https://ena-docs.readthedocs.io/en/latest/faq/locus_tags.html
# Strict INSDC locus tag prefix: one uppercase letter followed by 2-11 uppercase
# letters or digits, i.e. 3-12 characters in total.
RE_INSDC_LOCUSTAG_PREFIX = re.compile(r'[A-Z][A-Z0-9]{2,11}') # https://ena-docs.readthedocs.io/en/latest/faq/locus_tags.html

############################################################################
Expand Down
81 changes: 75 additions & 6 deletions test/test_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,67 @@ def test_locus_ok(parameters, tmpdir):
assert proc.returncode == 0


@pytest.mark.parametrize(
    'parameters',
    [
        ['--locus-tag'],  # not provided
        ['--locus-tag', ''],  # empty
        ['--locus-tag', ' '],  # whitespace only
        # NOTE(review): as shown here this case is identical to the previous
        # one - confirm the intended number of whitespace characters.
        ['--locus-tag', ' '],  # whitespaces only
        ['--locus-tag', 'ABCDEFGHIJKLMNOPQRSTUVWXZ'],  # more than 24 characters
        ['--locus-tag', 'ABC!'],  # wrong characters
        ['--locus-tag', 'ABC?'],  # wrong characters
        ['--locus-tag', 'ABC*'],  # wrong characters
        ['--locus-tag', 'ABC,'],  # wrong characters
        ['--locus-tag', 'ABC;'],  # wrong characters
        ['--locus-tag', 'ABC:'],  # wrong characters
        ['--locus-tag', 'ABC§'],  # wrong characters
        ['--locus-tag', 'ABC$'],  # wrong characters
        ['--locus-tag', 'ABC%'],  # wrong characters
        ['--locus-tag', 'ABC&'],  # wrong characters
        ['--locus-tag', 'ABC/'],  # wrong characters
        ['--locus-tag', 'ABC='],  # wrong characters
        ['--locus-tag', 'ABC#'],  # wrong characters
    ]
)
def test_locustag_failiing(parameters, tmpdir):
    """Expect a non-zero exit code for each rejected '--locus-tag' value.

    The command is invoked without '--compliant', so every parametrized
    case is expected to make the run fail.
    """
    command_head = ['bin/bakta', '--db', 'test/db', '--output', tmpdir]
    command_tail = ['test/data/NC_002127.1.fna']
    result = run(command_head + parameters + SKIP_PARAMETERS + command_tail)
    assert result.returncode != 0


@pytest.mark.slow
@pytest.mark.parametrize(
    'parameters',
    [
        ['--locus-tag', 'A'],
        ['--locus-tag', '1'],
        ['--locus-tag', 'ABCDEFGHIJKLMNOPQRSTUVWX'],
        ['--locus-tag', 'A12'],
        ['--locus-tag', 'ABC.'],
        ['--locus-tag', 'ABC-'],
        ['--locus-tag', 'ABC_'],
        ['--locus-tag', 'GCF_014267685.1'],
        ['--locus-tag', 'ASM25969v1'],
        ['--locus-tag', 'DAESDI010000001.1'],
    ]
)
def test_locustag_ok(parameters, tmpdir):
    """Expect a zero exit code for each accepted '--locus-tag' value.

    The command is invoked without '--compliant'; every parametrized
    prefix is expected to let the run succeed.
    """
    command_head = ['bin/bakta', '--db', 'test/db', '--output', tmpdir]
    command_tail = ['test/data/NC_002127.1.fna']
    result = run(command_head + parameters + SKIP_PARAMETERS + command_tail)
    assert result.returncode == 0


@pytest.mark.parametrize(
'parameters',
[
Expand All @@ -302,18 +363,26 @@ def test_locus_ok(parameters, tmpdir):
(['--locus-tag', 'ABCDEFGHIJKLM']), # more than 12 characters
(['--locus-tag', 'ABC_']), # wrong characters
(['--locus-tag', 'ABC-']), # wrong characters
(['--locus-tag', 'ABC.']), # wrong characters
(['--locus-tag', 'ABC!']), # wrong characters
(['--locus-tag', 'ABC?']), # wrong characters
(['--locus-tag', 'ABC*']), # wrong characters
(['--locus-tag', 'ABC.']), # wrong characters
(['--locus-tag', 'ABC,']), # wrong characters
(['--locus-tag', 'ABC;']) # wrong characters
(['--locus-tag', 'ABC;']), # wrong characters
(['--locus-tag', 'ABC:']), # wrong characters
(['--locus-tag', 'ABC§']), # wrong characters
(['--locus-tag', 'ABC$']), # wrong characters
(['--locus-tag', 'ABC%']), # wrong characters
(['--locus-tag', 'ABC&']), # wrong characters
(['--locus-tag', 'ABC/']), # wrong characters
(['--locus-tag', 'ABC=']), # wrong characters
(['--locus-tag', 'ABC#']) # wrong characters
]
)
def test_locustag_failiing(parameters, tmpdir):
def test_locustag_compliant_failiing(parameters, tmpdir):
# test locus-tag prefix arguments
proc = run(
['bin/bakta', '--db', 'test/db', '--output', tmpdir] +
['bin/bakta', '--db', 'test/db', '--output', tmpdir, '--compliant'] +
parameters +
SKIP_PARAMETERS +
['test/data/NC_002127.1.fna']
Expand All @@ -331,10 +400,10 @@ def test_locustag_failiing(parameters, tmpdir):
(['--locus-tag', 'A23456789012'])
]
)
def test_locustag_ok(parameters, tmpdir):
def test_locustag_compliant_ok(parameters, tmpdir):
# test locus-tag prefix arguments
proc = run(
['bin/bakta', '--db', 'test/db', '--output', tmpdir] +
['bin/bakta', '--db', 'test/db', '--output', tmpdir, '--compliant'] +
parameters +
SKIP_PARAMETERS +
['test/data/NC_002127.1.fna']
Expand Down

0 comments on commit e2651c3

Please sign in to comment.