Skip to content

Commit

Permalink
replace underscores in domains #69
Browse files Browse the repository at this point in the history
  • Loading branch information
oschwengers committed Aug 31, 2021
1 parent 3787e01 commit f8ec2aa
Showing 1 changed file with 6 additions and 0 deletions.
6 changes: 6 additions & 0 deletions bakta/features/annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
# Pre-compiled patterns applied to protein product strings.

# Matches any of the uncertainty words "potential", "possible", "probable" or
# "predicted" anywhere in the text, regardless of case.
RE_PROTEIN_PUTATIVE = re.compile(r'(potential|possible|probable|predicted)', flags=re.IGNORECASE)
# Matches the literal substring "NODE_" (case-insensitive).
# NOTE(review): presumably targets assembler-style contig ids leaking into product names - confirm.
RE_PROTEIN_NODE = re.compile(r'NODE_', flags=re.IGNORECASE)
# Matches the words "genome" or "shotgun" (case-insensitive).
RE_PROTEIN_POTENTIAL_CONTIG_NAME = re.compile(r'(genome|shotgun)', flags=re.IGNORECASE)
# Matches the phrase "domain-containing protein" (case-insensitive); products that
# match get their underscores replaced by hyphens in revise_cds_product.
RE_PROTEIN_DOMAIN_CONTAINING = re.compile(r'domain-containing protein', flags=re.IGNORECASE)
# Matches any single character that is not an ASCII letter (A-Z, a-z).
RE_PROTEIN_NO_LETTERS = re.compile(r'[^A-Za-z]')
# Matches one uppercase letter, two lowercase letters, one uppercase letter and an
# optional trailing digit (e.g. "DnaA", "RecA1"); case-sensitive and unanchored.
RE_PROTEIN_SYMBOL = re.compile(r'[A-Z][a-z]{2}[A-Z][0-9]?')

Expand Down Expand Up @@ -448,6 +449,11 @@ def revise_cds_product(feature):
product = product.replace('=', '')
log.info('fix product: remove = character. new=%s, old=%s', product, old_product)

old_product = product
if(RE_PROTEIN_DOMAIN_CONTAINING.search(product)): # replace underscores in domain names
product = product.replace('_', '-')
log.info('fix product: replace underscores. new=%s, old=%s', product, old_product)

old_product = product
if(RE_PROTEIN_CONTIG.search(product)):
product = bc.HYPOTHETICAL_PROTEIN
Expand Down

0 comments on commit f8ec2aa

Please sign in to comment.