Skip to content

Commit

Permalink
add locus prefix checks
Browse files Browse the repository at this point in the history
  • Loading branch information
oschwengers committed Nov 23, 2021
1 parent 8e27f68 commit 8c58851
Show file tree
Hide file tree
Showing 3 changed files with 6 additions and 5 deletions.
3 changes: 3 additions & 0 deletions bakta/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,9 @@ def setup(args):
if(' ' in locus):
log.error("Whitespace character in 'locus' parameter! locus=%s", locus)
sys.exit(f"ERROR: whitespace character ({locus}) in 'locus' parameter!")
if(bc.RE_INSDC_ID_PREFIX.fullmatch(locus) is None):
log.error("Invalid 'locus' parameter! locus=%s", locus)
sys.exit(f"ERROR: invalid 'locus' parameter ({locus})!\nLocus prefixes must contain between 1 and 20 alphanumeric and '-_' characters.")
log.info('locus=%s', locus)
locus_tag = args.locus_tag
if(locus_tag is not None):
Expand Down
2 changes: 2 additions & 0 deletions bakta/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
# Common regex
############################################################################
# Matches a run of one or more whitespace characters.
RE_MULTIWHITESPACE = re.compile(r'\s+')
# NOTE: the INSDC patterns below carry no ^/$ anchors; they only validate a
# complete string when applied with fullmatch() rather than search()/match().
# ID prefix: 1 to 20 characters out of letters A-Z/a-z, digits (\d) and '_.:*#-'.
RE_INSDC_ID_PREFIX = re.compile(r'[A-Za-z\d_.:*#-]{1,20}') # https://www.ncbi.nlm.nih.gov/WebSub/html/help/fasta.html
# Full ID: same character set as the prefix pattern, but 1 to 25 characters.
RE_INSDC_ID = re.compile(r'[A-Za-z\d_.:*#-]{1,25}') # https://www.ncbi.nlm.nih.gov/WebSub/html/help/fasta.html
# Locus tag prefix: one uppercase letter followed by 2 to 11 uppercase letters
# or digits 0-9, i.e. 3 to 12 characters in total.
RE_INSDC_LOCUSTAG_PREFIX = re.compile(r'[A-Z][A-Z0-9]{2,11}') # https://ena-docs.readthedocs.io/en/latest/faq/locus_tags.html

############################################################################
Expand Down
6 changes: 1 addition & 5 deletions bakta/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,6 @@ def print_version(self):
DEPENDENCY_AMRFINDERPLUS = (Version(3, 10, 16), Version(VERSION_MAX_DIGIT, VERSION_MAX_DIGIT, VERSION_MAX_DIGIT), VERSION_REGEX, ('amrfinder', '--version'), ['--skip-cds'])


INSDC_ID_REGEX = re.compile(r'[^A-Za-z\d_.:*#-]') # https://www.ncbi.nlm.nih.gov/WebSub/html/help/fasta.html


def init_parser():
parser = argparse.ArgumentParser(
prog='bakta',
Expand Down Expand Up @@ -441,8 +438,7 @@ def qc_contigs(contigs, replicons):
if(len(contig['id']) > 25): # max 25 characters
log.error('INSDC compliance: contig id larger than 25! contig-id=%s', contig['id'])
sys.exit(f"ERROR: INSDC compliance failed! Contig ID ({contig['id']}) larger than 25 characers!")
unvalid_char_match = INSDC_ID_REGEX.search(contig['id'])
if(unvalid_char_match is not None): # invalid characters
if(bc.RE_INSDC_ID.fullmatch(contig['id']) is None): # invalid characters
log.error('INSDC compliance: contig id contains invalid characters! contig-id=%s', contig['id'])
sys.exit(f"ERROR: INSDC compliance failed! Contig ID ({contig['id']}) contains invalid characters!")

Expand Down

0 comments on commit 8c58851

Please sign in to comment.