Skip to content

Commit

Permalink
fix common INSDC submission errors on CDS feature products (#334)
Browse files Browse the repository at this point in the history
* add whitespace replacement
* remove # from products
* remove suspect first character
* remove remnant of terms
* discard pure tmRNA CDS product descriptions
  • Loading branch information
oschwengers authored Oct 10, 2024
1 parent 2d1ca75 commit 9bd2227
Showing 1 changed file with 26 additions and 2 deletions.
28 changes: 26 additions & 2 deletions bakta/features/annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,12 @@
# Patterns and character sets used to clean up CDS product descriptions.
RE_PROTEIN_NODE = re.compile(r'NODE_', flags=re.IGNORECASE)
RE_PROTEIN_POTENTIAL_CONTIG_NAME = re.compile(r'(genome|shotgun)', flags=re.IGNORECASE)
RE_PROTEIN_DOMAIN_CONTAINING = re.compile(r'domain-containing protein', flags=re.IGNORECASE)
RE_PROTEIN_REMNANT = re.compile(r'Remnant of ', flags=re.IGNORECASE)  # removed from products wherever it occurs
RE_PROTEIN_TMRNA = re.compile(r'TmRNA', flags=re.IGNORECASE)  # products that fully match this are discarded
RE_PROTEIN_NO_LETTERS = re.compile(r'[^A-Za-z]')
RE_PROTEIN_SUSPECT_CHARS = re.compile(r'[.@=?%]')  # characters removed from products
RE_PROTEIN_SUSPECT_CHARS_DISCARD = re.compile(r'[.#]')  # characters removed from products
RE_PROTEIN_SUSPECT_CHARS_REPLACE = re.compile(r'[@=?%]')  # characters replaced by a single whitespace
# NOTE: despite the RE_ prefix this is a plain string used for single-character membership tests
# on the first character of a product, not a compiled pattern. The backslash is escaped explicitly
# to avoid the invalid escape sequence '\-' while keeping the runtime value unchanged.
RE_PROTEIN_SUSPECT_CHARS_BEGINNING = '_\\-+.:,;/\\\''
RE_PROTEIN_PERIOD_SEPARATOR = re.compile(r'([a-zA-Z0-9]+)\.([a-zA-Z0-9]+)')  # period between two alphanumeric runs
RE_PROTEIN_WRONG_PRIMES = re.compile(r'[\u2032\u0060\u00B4]') # prime (′), grave accent (`), acute accent (´)
RE_PROTEIN_WEIGHT = re.compile(r' [0-9]+(?:\.[0-9]+)? k?da ', flags=re.IGNORECASE)
Expand Down Expand Up @@ -536,9 +540,19 @@ def revise_cds_product(product: str):
product = re.sub(RE_PROTEIN_PERIOD_SEPARATOR, r'\1-\2', product) # replace separator periods
if(product != old_product):
log.info('fix product: replace separator periods. new=%s, old=%s', product, old_product)

old_product = product
if(product[0] in RE_PROTEIN_SUSPECT_CHARS_BEGINNING): # remove suspect first character
product = product[1:]
log.info('fix product: replace invalid first character. new=%s, old=%s', product, old_product)

old_product = product
product = RE_PROTEIN_SUSPECT_CHARS_DISCARD.sub('', product) # remove suspect characters
if(product != old_product):
log.info('fix product: replace invalid characters. new=%s, old=%s', product, old_product)

old_product = product
product = RE_PROTEIN_SUSPECT_CHARS.sub('', product) # remove suspect characters
product = RE_PROTEIN_SUSPECT_CHARS_REPLACE.sub(' ', product) # replace suspect characters by single whitespace
if(product != old_product):
log.info('fix product: replace invalid characters. new=%s, old=%s', product, old_product)

Expand All @@ -552,6 +566,11 @@ def revise_cds_product(product: str):
if(product != old_product):
log.info('fix product: replace FOG ids. new=%s, old=%s', product, old_product)

old_product = product
product = RE_PROTEIN_REMNANT.sub('', product) # remove 'Remnant of's
if(product != old_product):
log.info('fix product: replace remnant ofs. new=%s, old=%s', product, old_product)

old_product = product
dufs = [] # replace DUF-containing products
for m in RE_DOMAIN_OF_UNKNOWN_FUNCTION.finditer(product):
Expand Down Expand Up @@ -593,6 +612,11 @@ def revise_cds_product(product: str):
product = product.replace('_', '-')
if(product != old_product):
log.info('fix product: replace domain name underscores. new=%s, old=%s', product, old_product)

old_product = product
if(RE_PROTEIN_TMRNA.fullmatch(product)):
product = ''
log.info('fix product: discard pure tmRNA product descriptions. new=%s, old=%s', product, old_product)

old_product = product
if(
Expand Down

0 comments on commit 9bd2227

Please sign in to comment.