Validate the 'locus' prefix against the INSDC ID rules and move the INSDC ID
regex from bakta/utils.py into bakta/constants.py.

The INSDC ID patterns use an explicit 0-9 range (in Python str patterns \d
also matches non-ASCII Unicode digits), and the 'locus' error message lists
every character the pattern accepts.

diff --git a/bakta/config.py b/bakta/config.py
index 52e5310c..fc8c3161 100644
--- a/bakta/config.py
+++ b/bakta/config.py
@@ -208,6 +208,9 @@ def setup(args):
         if(' ' in locus):
             log.error("Whitespace character in 'locus' parameter! locus=%s", locus)
             sys.exit(f"ERROR: whitespace character ({locus}) in 'locus' parameter!")
+        if(bc.RE_INSDC_ID_PREFIX.fullmatch(locus) is None):
+            log.error("Invalid 'locus' parameter! locus=%s", locus)
+            sys.exit(f"ERROR: invalid 'locus' parameter ({locus})!\nLocus prefixes must contain between 1 and 20 alphanumeric and '-_.:*#' characters.")
     log.info('locus=%s', locus)
     locus_tag = args.locus_tag
     if(locus_tag is not None):
diff --git a/bakta/constants.py b/bakta/constants.py
index 1199c75e..49291f7a 100644
--- a/bakta/constants.py
+++ b/bakta/constants.py
@@ -5,6 +5,8 @@
 # Common regex
 ############################################################################
 RE_MULTIWHITESPACE = re.compile(r'\s+')
+RE_INSDC_ID_PREFIX = re.compile(r'[A-Za-z0-9_.:*#-]{1,20}') # https://www.ncbi.nlm.nih.gov/WebSub/html/help/fasta.html
+RE_INSDC_ID = re.compile(r'[A-Za-z0-9_.:*#-]{1,25}') # https://www.ncbi.nlm.nih.gov/WebSub/html/help/fasta.html
 RE_INSDC_LOCUSTAG_PREFIX = re.compile(r'[A-Z][A-Z0-9]{2,11}') # https://ena-docs.readthedocs.io/en/latest/faq/locus_tags.html
 
 ############################################################################
diff --git a/bakta/utils.py b/bakta/utils.py
index ac648c83..45a46ebd 100644
--- a/bakta/utils.py
+++ b/bakta/utils.py
@@ -43,9 +43,6 @@ def print_version(self):
 DEPENDENCY_AMRFINDERPLUS = (Version(3, 10, 16), Version(VERSION_MAX_DIGIT, VERSION_MAX_DIGIT, VERSION_MAX_DIGIT), VERSION_REGEX, ('amrfinder', '--version'), ['--skip-cds'])
 
 
-INSDC_ID_REGEX = re.compile(r'[^A-Za-z\d_.:*#-]') # https://www.ncbi.nlm.nih.gov/WebSub/html/help/fasta.html
-
-
 def init_parser():
     parser = argparse.ArgumentParser(
         prog='bakta',
@@ -441,8 +438,7 @@ def qc_contigs(contigs, replicons):
             if(len(contig['id']) > 25): # max 25 characters
                 log.error('INSDC compliance: contig id larger than 25! contig-id=%s', contig['id'])
                 sys.exit(f"ERROR: INSDC compliance failed! Contig ID ({contig['id']}) larger than 25 characers!")
-            unvalid_char_match = INSDC_ID_REGEX.search(contig['id'])
-            if(unvalid_char_match is not None): # invalid characters
+            if(bc.RE_INSDC_ID.fullmatch(contig['id']) is None): # invalid characters
                 log.error('INSDC compliance: contig id contains invalid characters! contig-id=%s', contig['id'])
                 sys.exit(f"ERROR: INSDC compliance failed! Contig ID ({contig['id']}) contains invalid characters!")