From e7f00df0ad3b4869a5d3cd562a2a15e887aae0eb Mon Sep 17 00:00:00 2001
From: Oliver Schwengers <oliver.schwengers@computational.bio.uni-giessen.de>
Date: Thu, 18 Jan 2024 09:46:28 +0100
Subject: [PATCH] Update db build scripts (#270)

* update MOB-suite to v3.1.8
* update UniParc file download
* bump Diamond to v2.1.8
* bump HMMER to v3.4
* URL updates and minor fixes
* add EC & GO terms and update to new NCBIFams file format
---
 db-scripts/annotate-ncbi-fams.py | 83 ++++++++++++++++++++++++++++----
 db-scripts/buid-db.sh            | 18 ++++---
 db-scripts/diamond.nf            |  2 +-
 db-scripts/hmmsearch.nf          |  2 +-
 db-scripts/init-pscc.py          |  4 +-
 5 files changed, 89 insertions(+), 20 deletions(-)

diff --git a/db-scripts/annotate-ncbi-fams.py b/db-scripts/annotate-ncbi-fams.py
index d6d3e481..72196e1b 100644
--- a/db-scripts/annotate-ncbi-fams.py
+++ b/db-scripts/annotate-ncbi-fams.py
@@ -40,15 +40,42 @@
 with hmms_path.open() as fh, alive_bar() as bar:
     for line in fh:
         (
-            ncbi_accession, source_identifier, label, sequence_cutoff, domain_cutoff, hmm_length, family_type, for_structural_annotation, for_naming, for_AMRFinder, product_name, gene_symbol, ec_numbers, go_terms, pmids, taxonomic_range, taxonomic_range_name, taxonomic_rank_name, n_refseq_protein_hits, source, name_orig
+            ncbi_accession, 
+            source_identifier, 
+            label, 
+            sequence_cutoff, 
+            domain_cutoff, 
+            hmm_length, 
+            family_type, 
+            for_structural_annotation, 
+            for_naming, 
+            for_AMRFinder, 
+            product_name, 
+            gene_symbol, 
+            ec_numbers, 
+            go_terms, 
+            pmids, 
+            taxonomic_range, 
+            taxonomic_range_name, 
+            taxonomic_rank_name, 
+            n_refseq_protein_hits, 
+            source, 
+            name_orig,
+            _1,
+            _2
         ) = line.split('\t')
         if(family_type in family_type_ranks.keys() and for_naming == 'Y'):  # only accept exception/equivalog HMMs eligible for naming proteins
             hmm = {
                 'acc': ncbi_accession,
-                'gene': gene_symbol,
                 'type': family_type,
                 'product': product_name.strip()
             }
+            if(gene_symbol != ''):
+                hmm['gene'] = gene_symbol
+            if(ec_numbers != ''):
+                hmm['ecs'] = ec_numbers.split(',')
+            if(go_terms != ''):
+                hmm['gos'] = [term.split(':')[1] for term in go_terms.split(',')]
             if(ncbi_accession != '-'):
                 hmms[ncbi_accession] = hmm
         bar()
@@ -86,6 +113,9 @@
 print('\n')
 
 psc_annotated = 0
+psc_with_gene = 0
+psc_with_ec = 0
+psc_with_go = 0
 with sqlite3.connect(str(db_path), isolation_level='EXCLUSIVE') as conn, alive_bar(total=len(hit_per_psc)) as bar:
     conn.execute('PRAGMA page_size = 4096;')
     conn.execute('PRAGMA cache_size = 100000;')
@@ -95,16 +125,51 @@
     conn.execute('PRAGMA journal_mode = OFF')
     conn.execute('PRAGMA threads = 2;')
     conn.commit()
+    conn.row_factory = sqlite3.Row
     for psc_id, hit in hit_per_psc.items():
         hmm = hmms.get(hit['hmm_id'], None)
         if(hmm is not None):
-            if(hmm['gene'] == ''):
-                conn.execute('UPDATE psc SET product=? WHERE uniref90_id=?', (hmm['product'], psc_id))  # annotate PSC
-                log_psc.info('UPDATE psc SET product=%s WHERE uniref90_id=%s', hmm['product'], psc_id)
-            else:
-                conn.execute('UPDATE psc SET gene=?, product=? WHERE uniref90_id=?', (hmm['gene'], hmm['product'], psc_id))  # annotate PSC
-                log_psc.info('UPDATE psc SET gene=%s, product=%s WHERE uniref90_id=%s', hmm['gene'], hmm['product'], psc_id)
+            conn.execute('UPDATE psc SET product=? WHERE uniref90_id=?', (hmm['product'], psc_id))  # annotate PSC
+            log_psc.info('UPDATE psc SET product=%s WHERE uniref90_id=%s', hmm['product'], psc_id)
+            if('gene' in hmm):
+                conn.execute('UPDATE psc SET gene=? WHERE uniref90_id=?', (hmm['gene'], psc_id))  # annotate PSC
+                log_psc.info('UPDATE psc SET gene=%s WHERE uniref90_id=%s', hmm['gene'], psc_id)
+                psc_with_gene += 1
+            if('ecs' in hmm):
+                rec_psc = conn.execute('SELECT ec_ids FROM psc WHERE uniref90_id=?', (psc_id,)).fetchone()
+                if(rec_psc is None):
+                    continue
+                if(rec_psc['ec_ids'] is not None):
+                    ecs = set(rec_psc['ec_ids'].split(','))
+                    for ec in hmm['ecs']:
+                        ecs.add(ec)
+                else:
+                    ecs = hmm['ecs']
+                ec_list = ','.join(sorted(ecs))
+                conn.execute('UPDATE psc SET ec_ids=? WHERE uniref90_id=?', (ec_list, psc_id))  # annotate PSC
+                log_psc.info('UPDATE psc SET ec_ids=%s WHERE uniref90_id=%s', ec_list, psc_id)
+                psc_with_ec += 1
+            if('gos' in hmm):
+                rec_psc = conn.execute('SELECT go_ids FROM psc WHERE uniref90_id=?', (psc_id,)).fetchone()
+                if(rec_psc is None):
+                    continue
+                if(rec_psc['go_ids'] is not None):
+                    gos = set(rec_psc['go_ids'].split(','))
+                    for go in hmm['gos']:
+                        gos.add(go)
+                else:
+                    gos = hmm['gos']
+                go_list = ','.join(sorted(gos))
+                conn.execute('UPDATE psc SET go_ids=? WHERE uniref90_id=?', (go_list, psc_id))  # annotate PSC
+                log_psc.info('UPDATE psc SET go_ids=%s WHERE uniref90_id=%s', go_list, psc_id)
+                psc_with_go += 1
             psc_annotated += 1
         bar()
-print(f'PSCs with annotated gene / product: {psc_annotated}')
+print(f'PSCs with annotated product: {psc_annotated}')
 log_psc.debug('summary: PSC annotated=%d', psc_annotated)
+print(f'PSCs with annotated gene: {psc_with_gene}')
+log_psc.debug('summary: PSC gene annotated=%d', psc_with_gene)
+print(f'PSCs with annotated EC: {psc_with_ec}')
+log_psc.debug('summary: PSC EC annotated=%d', psc_with_ec)
+print(f'PSCs with annotated GO: {psc_with_go}')
+log_psc.debug('summary: PSC GO annotated=%d', psc_with_go)
diff --git a/db-scripts/buid-db.sh b/db-scripts/buid-db.sh
index f2cc35af..812ea653 100644
--- a/db-scripts/buid-db.sh
+++ b/db-scripts/buid-db.sh
@@ -62,7 +62,7 @@ rm -r antifam antifam-dir/
 
 # download & extract oriT sequences
 printf "\n5/20: download and extract oriT sequences from Mob-suite ...\n"
-wget https://zenodo.org/record/3786915/files/data.tar.gz
+wget https://zenodo.org/records/10304948/files/data.tar.gz
 tar -xvzf data.tar.gz
 mv data/orit.fas ./orit.fna
 rm -r data/ data.tar.gz
@@ -109,9 +109,13 @@ python3 ${BAKTA_DB_SCRIPTS}/init-db.py --db bakta.db
 ############################################################################
 printf "\n8/20: download UniProt UniRef50 ...\n"
 wget https://ftp.expasy.org/databases/uniprot/current_release/uniref/uniref50/uniref50.xml.gz
-wget https://ftp.expasy.org/databases/uniprot/current_release/uniparc/uniparc_active.fasta.gz
+for i in {1..200}; do
+    wget https://ftp.expasy.org/databases/uniprot/current_release/uniparc/fasta/active/uniparc_active_p${i}.fasta.gz
+    pigz -dc uniparc_active_p${i}.fasta.gz >> uniparc_active.fasta
+    rm uniparc_active_p${i}.fasta.gz
+done
 printf "\n8/20: read UniRef90 entries and build Protein Sequence Cluster sequence and information databases:\n"
-python3 ${BAKTA_DB_SCRIPTS}/init-pscc.py --taxonomy nodes.dmp --uniref50 uniref50.xml.gz --uniparc uniparc_active.fasta.gz --db bakta.db --pscc pscc.faa --sorf pscc_sorf.faa
+python3 ${BAKTA_DB_SCRIPTS}/init-pscc.py --taxonomy nodes.dmp --uniref50 uniref50.xml.gz --uniparc uniparc_active.fasta --db bakta.db --pscc pscc.faa --pscc_sorf pscc_sorf.faa
 printf "\n8/20: build PSCC Diamond db ...\n"
 diamond makedb --in pscc.faa --db pscc
 diamond makedb --in pscc_sorf.faa --db sorf
@@ -133,7 +137,7 @@ rm uniref50.xml.gz
 printf "\n9/20: download UniProt UniRef90 ...\n"
 wget https://ftp.expasy.org/databases/uniprot/current_release/uniref/uniref90/uniref90.xml.gz
 printf "\n9/20: read UniRef90 entries and build Protein Sequence Cluster sequence and information databases:\n"
-python3 ${BAKTA_DB_SCRIPTS}/init-psc.py --taxonomy nodes.dmp --uniref90 uniref90.xml.gz --uniparc uniparc_active.fasta.gz --db bakta.db --psc psc.faa --sorf sorf.faa
+python3 ${BAKTA_DB_SCRIPTS}/init-psc.py --taxonomy nodes.dmp --uniref90 uniref90.xml.gz --uniparc uniparc_active.fasta --db bakta.db --psc psc.faa --psc_sorf sorf.faa
 printf "\n9/20: build PSC Diamond db ...\n"
 diamond makedb --in psc.faa --db psc
 diamond makedb --in sorf.faa --db sorf
@@ -148,7 +152,7 @@ rm uniref90.xml.gz
 printf "\n10/20: download UniProt UniRef100 ...\n"
 wget https://ftp.expasy.org/databases/uniprot/current_release/uniref/uniref100/uniref100.xml.gz
 printf "\n10/20: read, filter and store UniRef100 entries ...:\n"
-python3 ${BAKTA_DB_SCRIPTS}/init-ups-ips.py --taxonomy nodes.dmp --uniref100 uniref100.xml.gz --uniparc uniparc_active.fasta.gz --db bakta.db --ips ips.faa
+python3 ${BAKTA_DB_SCRIPTS}/init-ups-ips.py --taxonomy nodes.dmp --uniref100 uniref100.xml.gz --uniparc uniparc_active.fasta --db bakta.db --ips ips.faa
 rm uniref100.xml.gz uniparc_active.fasta.gz
 
 
@@ -224,7 +228,7 @@ rm refseq-bacteria-nrp.trimmed.faa PCLA_proteins.txt PCLA_clusters.txt
 # - annotate IPSs if IPS have no PSC UniRef90 identifier (seq -> hash -> UPS -> IPS)
 ############################################################################
 printf "\n14/20: download UniProt/SwissProt ...\n"
-wget https://ftp.expasy.org/databases/swiss-prot/release/uniprot_sprot.xml.gz
+wget https://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz
 printf "\n14/20: annotate IPSs and PSCs ...\n"
 python3 ${BAKTA_DB_SCRIPTS}/annotate-swissprot.py --taxonomy nodes.dmp --xml uniprot_sprot.xml.gz --db bakta.db
 rm uniprot_sprot.xml.gz
@@ -241,7 +245,7 @@ wget https://ftp.ncbi.nlm.nih.gov/hmm/current/hmm_PGAP.tsv
 grep -v "(Provisional)" hmm_PGAP.tsv > hmms.non-prov.tsv
 grep exception hmms.non-prov.tsv > hmms.ncbi.selected.tsv
 grep equivalog hmms.non-prov.tsv >> hmms.ncbi.selected.tsv
-cut -f1 hmms.ncbi.selected.tsv > hmms.ids.txt
+sort hmms.ncbi.selected.tsv | uniq | cut -f1 > hmms.ids.txt
 hmmfetch -f -o ncbifams hmm_PGAP.LIB hmms.ids.txt
 hmmpress ncbifams
 printf "\n15/20: annotate PSCs...\n"
diff --git a/db-scripts/diamond.nf b/db-scripts/diamond.nf
index 41865809..8bf8fbf1 100644
--- a/db-scripts/diamond.nf
+++ b/db-scripts/diamond.nf
@@ -27,7 +27,7 @@ process diamond {
     maxRetries 3
     cpus 8
     memory '32 GB'
-    conda 'diamond=2.0.14'
+    conda 'diamond=2.1.8'
 
     input:
     file('input.faa') from chAAs
diff --git a/db-scripts/hmmsearch.nf b/db-scripts/hmmsearch.nf
index 638f9197..ba5af8ec 100644
--- a/db-scripts/hmmsearch.nf
+++ b/db-scripts/hmmsearch.nf
@@ -30,7 +30,7 @@ process hmmsearch {
     maxRetries 3
     cpus 1
     memory { 1.GB * task.attempt }
-    conda 'hmmer=3.3.2'
+    conda 'hmmer=3.4'
 
     input:
     path('input.faa') from chAAs
diff --git a/db-scripts/init-pscc.py b/db-scripts/init-pscc.py
index c8397230..d67a3a97 100644
--- a/db-scripts/init-pscc.py
+++ b/db-scripts/init-pscc.py
@@ -189,8 +189,8 @@ def is_taxon_child(child, LCA, taxonomy):
 log_pscc.debug('written UniParc seed sequences: %i', uniparc_total_seqs)
 
 print(f'PSCC normal seqs: {pscc_seqs}')
-log_pscc.debug('summary: # PSC normal=%i', pscc_seqs)
+log_pscc.debug('summary: # PSCC normal=%i', pscc_seqs)
 print(f'PSCC sORF seqs: {pscc_sorf_seqs}')
-log_pscc.debug('summary: # PSC sORFs=%i', pscc_sorf_seqs)
+log_pscc.debug('summary: # PSCC sORFs=%i', pscc_sorf_seqs)
 
 print("\nsuccessfully initialized PSCC table!")