From e7f00df0ad3b4869a5d3cd562a2a15e887aae0eb Mon Sep 17 00:00:00 2001 From: Oliver Schwengers Date: Thu, 18 Jan 2024 09:46:28 +0100 Subject: [PATCH] Update db build scripts (#270) * update MOB-suite to v3.1.8 * update UniParc file download * bump Diamond to v2.1.8 * bump HMMER to v3.4 * URL updates and minor fixes * add EC & GO terms and update to new NCBIFams file format --- db-scripts/annotate-ncbi-fams.py | 83 ++++++++++++++++++++++++++++---- db-scripts/buid-db.sh | 18 ++++--- db-scripts/diamond.nf | 2 +- db-scripts/hmmsearch.nf | 2 +- db-scripts/init-pscc.py | 4 +- 5 files changed, 89 insertions(+), 20 deletions(-) diff --git a/db-scripts/annotate-ncbi-fams.py b/db-scripts/annotate-ncbi-fams.py index d6d3e481..72196e1b 100644 --- a/db-scripts/annotate-ncbi-fams.py +++ b/db-scripts/annotate-ncbi-fams.py @@ -40,15 +40,42 @@ with hmms_path.open() as fh, alive_bar() as bar: for line in fh: ( - ncbi_accession, source_identifier, label, sequence_cutoff, domain_cutoff, hmm_length, family_type, for_structural_annotation, for_naming, for_AMRFinder, product_name, gene_symbol, ec_numbers, go_terms, pmids, taxonomic_range, taxonomic_range_name, taxonomic_rank_name, n_refseq_protein_hits, source, name_orig + ncbi_accession, + source_identifier, + label, + sequence_cutoff, + domain_cutoff, + hmm_length, + family_type, + for_structural_annotation, + for_naming, + for_AMRFinder, + product_name, + gene_symbol, + ec_numbers, + go_terms, + pmids, + taxonomic_range, + taxonomic_range_name, + taxonomic_rank_name, + n_refseq_protein_hits, + source, + name_orig, + _1, + _2 ) = line.split('\t') if(family_type in family_type_ranks.keys() and for_naming == 'Y'): # only accept exception/equivalog HMMs eligible for naming proteins hmm = { 'acc': ncbi_accession, - 'gene': gene_symbol, 'type': family_type, 'product': product_name.strip() } + if(gene_symbol != ''): + hmm['gene'] = gene_symbol + if(ec_numbers != ''): + hmm['ecs'] = ec_numbers.split(',') + if(go_terms != ''): + hmm['gos'] = [term.split(':')[1] for term in go_terms.split(',')] if(ncbi_accession != '-'): hmms[ncbi_accession] = hmm bar() @@ -86,6 +113,9 @@ print('\n') psc_annotated = 0 +psc_with_gene = 0 +psc_with_ec = 0 +psc_with_go = 0 with sqlite3.connect(str(db_path), isolation_level='EXCLUSIVE') as conn, alive_bar(total=len(hit_per_psc)) as bar: conn.execute('PRAGMA page_size = 4096;') conn.execute('PRAGMA cache_size = 100000;') @@ -95,16 +125,51 @@ conn.execute('PRAGMA journal_mode = OFF') conn.execute('PRAGMA threads = 2;') conn.commit() + conn.row_factory = sqlite3.Row for psc_id, hit in hit_per_psc.items(): hmm = hmms.get(hit['hmm_id'], None) if(hmm is not None): - if(hmm['gene'] == ''): - conn.execute('UPDATE psc SET product=? WHERE uniref90_id=?', (hmm['product'], psc_id)) # annotate PSC - log_psc.info('UPDATE psc SET product=%s WHERE uniref90_id=%s', hmm['product'], psc_id) - else: - conn.execute('UPDATE psc SET gene=?, product=? WHERE uniref90_id=?', (hmm['gene'], hmm['product'], psc_id)) # annotate PSC - log_psc.info('UPDATE psc SET gene=%s, product=%s WHERE uniref90_id=%s', hmm['gene'], hmm['product'], psc_id) + conn.execute('UPDATE psc SET product=? WHERE uniref90_id=?', (hmm['product'], psc_id)) # annotate PSC + log_psc.info('UPDATE psc SET product=%s WHERE uniref90_id=%s', hmm['product'], psc_id) + if('gene' in hmm): + conn.execute('UPDATE psc SET gene=? WHERE uniref90_id=?', (hmm['gene'], psc_id)) # annotate PSC + log_psc.info('UPDATE psc SET gene=%s WHERE uniref90_id=%s', hmm['gene'], psc_id) + psc_with_gene += 1 + if('ecs' in hmm): + rec_psc = conn.execute('SELECT ec_ids FROM psc WHERE uniref90_id=?', (psc_id,)).fetchone() + if(rec_psc is None): + continue + if(rec_psc['ec_ids'] is not None): + ecs = set(rec_psc['ec_ids'].split(',')) + for ec in hmm['ecs']: + ecs.add(ec) + else: + ecs = hmm['ecs'] + ec_list = ','.join(sorted(ecs)) + conn.execute('UPDATE psc SET ec_ids=? WHERE uniref90_id=?', (ec_list, psc_id)) # annotate PSC + log_psc.info('UPDATE psc SET ec_ids=%s WHERE uniref90_id=%s', ec_list, psc_id) + psc_with_ec += 1 + if('gos' in hmm): + rec_psc = conn.execute('SELECT go_ids FROM psc WHERE uniref90_id=?', (psc_id,)).fetchone() + if(rec_psc is None): + continue + if(rec_psc['go_ids'] is not None): + gos = set(rec_psc['go_ids'].split(',')) + for go in hmm['gos']: + gos.add(go) + else: + gos = hmm['gos'] + go_list = ','.join(sorted(gos)) + conn.execute('UPDATE psc SET go_ids=? WHERE uniref90_id=?', (go_list, psc_id)) # annotate PSC + log_psc.info('UPDATE psc SET go_ids=%s WHERE uniref90_id=%s', go_list, psc_id) + psc_with_go += 1 psc_annotated += 1 bar() -print(f'PSCs with annotated gene / product: {psc_annotated}') +print(f'PSCs with annotated product: {psc_annotated}') log_psc.debug('summary: PSC annotated=%d', psc_annotated) +print(f'PSCs with annotated gene: {psc_with_gene}') +log_psc.debug('summary: PSC gene annotated=%d', psc_with_gene) +print(f'PSCs with annotated EC: {psc_with_ec}') +log_psc.debug('summary: PSC EC annotated=%d', psc_with_ec) +print(f'PSCs with annotated GO: {psc_with_go}') +log_psc.debug('summary: PSC GO annotated=%d', psc_with_go) diff --git a/db-scripts/buid-db.sh b/db-scripts/buid-db.sh index f2cc35af..812ea653 100644 --- a/db-scripts/buid-db.sh +++ b/db-scripts/buid-db.sh @@ -62,7 +62,7 @@ rm -r antifam antifam-dir/ # download & extract oriT sequences printf "\n5/20: download and extract oriT sequences from Mob-suite ...\n" -wget https://zenodo.org/record/3786915/files/data.tar.gz +wget https://zenodo.org/records/10304948/files/data.tar.gz tar -xvzf data.tar.gz mv data/orit.fas ./orit.fna rm -r data/ data.tar.gz @@ -109,9 +109,13 @@ python3 ${BAKTA_DB_SCRIPTS}/init-db.py --db bakta.db ############################################################################ printf "\n8/20: download UniProt UniRef50 ...\n" wget https://ftp.expasy.org/databases/uniprot/current_release/uniref/uniref50/uniref50.xml.gz -wget https://ftp.expasy.org/databases/uniprot/current_release/uniparc/uniparc_active.fasta.gz +for i in {1..200}; do + wget https://ftp.expasy.org/databases/uniprot/current_release/uniparc/fasta/active/uniparc_active_p${i}.fasta.gz + pigz -dc uniparc_active_p${i}.fasta.gz >> uniparc_active.fasta + rm uniparc_active_p${i}.fasta.gz +done printf "\n8/20: read UniRef90 entries and build Protein Sequence Cluster sequence and information databases:\n" -python3 ${BAKTA_DB_SCRIPTS}/init-pscc.py --taxonomy nodes.dmp --uniref50 uniref50.xml.gz --uniparc uniparc_active.fasta.gz --db bakta.db --pscc pscc.faa --sorf pscc_sorf.faa +python3 ${BAKTA_DB_SCRIPTS}/init-pscc.py --taxonomy nodes.dmp --uniref50 uniref50.xml.gz --uniparc uniparc_active.fasta --db bakta.db --pscc pscc.faa --pscc_sorf pscc_sorf.faa printf "\n8/20: build PSCC Diamond db ...\n" diamond makedb --in pscc.faa --db pscc diamond makedb --in pscc_sorf.faa --db sorf @@ -133,7 +137,7 @@ rm uniref50.xml.gz printf "\n9/20: download UniProt UniRef90 ...\n" wget https://ftp.expasy.org/databases/uniprot/current_release/uniref/uniref90/uniref90.xml.gz printf "\n9/20: read UniRef90 entries and build Protein Sequence Cluster sequence and information databases:\n" -python3 ${BAKTA_DB_SCRIPTS}/init-psc.py --taxonomy nodes.dmp --uniref90 uniref90.xml.gz --uniparc uniparc_active.fasta.gz --db bakta.db --psc psc.faa --sorf sorf.faa +python3 ${BAKTA_DB_SCRIPTS}/init-psc.py --taxonomy nodes.dmp --uniref90 uniref90.xml.gz --uniparc uniparc_active.fasta --db bakta.db --psc psc.faa --psc_sorf sorf.faa printf "\n9/20: build PSC Diamond db ...\n" diamond makedb --in psc.faa --db psc diamond makedb --in sorf.faa --db sorf @@ -148,7 +152,7 @@ rm uniref90.xml.gz printf "\n10/20: download UniProt UniRef100 ...\n" wget https://ftp.expasy.org/databases/uniprot/current_release/uniref/uniref100/uniref100.xml.gz printf "\n10/20: read, filter and store UniRef100 entries ...:\n" -python3 ${BAKTA_DB_SCRIPTS}/init-ups-ips.py --taxonomy nodes.dmp --uniref100 uniref100.xml.gz --uniparc uniparc_active.fasta.gz --db bakta.db --ips ips.faa +python3 ${BAKTA_DB_SCRIPTS}/init-ups-ips.py --taxonomy nodes.dmp --uniref100 uniref100.xml.gz --uniparc uniparc_active.fasta --db bakta.db --ips ips.faa rm uniref100.xml.gz uniparc_active.fasta.gz @@ -224,7 +228,7 @@ rm refseq-bacteria-nrp.trimmed.faa PCLA_proteins.txt PCLA_clusters.txt # - annotate IPSs if IPS have no PSC UniRef90 identifier (seq -> hash -> UPS -> IPS) ############################################################################ printf "\n14/20: download UniProt/SwissProt ...\n" -wget https://ftp.expasy.org/databases/swiss-prot/release/uniprot_sprot.xml.gz +wget https://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz printf "\n14/20: annotate IPSs and PSCs ...\n" python3 ${BAKTA_DB_SCRIPTS}/annotate-swissprot.py --taxonomy nodes.dmp --xml uniprot_sprot.xml.gz --db bakta.db rm uniprot_sprot.xml.gz @@ -241,7 +245,7 @@ wget https://ftp.ncbi.nlm.nih.gov/hmm/current/hmm_PGAP.tsv grep -v "(Provisional)" hmm_PGAP.tsv > hmms.non-prov.tsv grep exception hmms.non-prov.tsv > hmms.ncbi.selected.tsv grep equivalog hmms.non-prov.tsv >> hmms.ncbi.selected.tsv -cut -f1 hmms.ncbi.selected.tsv > hmms.ids.txt +sort hmms.ncbi.selected.tsv | uniq | cut -f1 > hmms.ids.txt hmmfetch -f -o ncbifams hmm_PGAP.LIB hmms.ids.txt hmmpress ncbifams printf "\n15/20: annotate PSCs...\n" diff --git a/db-scripts/diamond.nf b/db-scripts/diamond.nf index 41865809..8bf8fbf1 100644 --- a/db-scripts/diamond.nf +++ b/db-scripts/diamond.nf @@ -27,7 +27,7 @@ process diamond { maxRetries 3 cpus 8 memory '32 GB' - conda 'diamond=2.0.14' + conda 'diamond=2.1.8' input: file('input.faa') from chAAs diff --git a/db-scripts/hmmsearch.nf b/db-scripts/hmmsearch.nf index 638f9197..ba5af8ec 100644 --- a/db-scripts/hmmsearch.nf +++ b/db-scripts/hmmsearch.nf @@ -30,7 +30,7 @@ process hmmsearch { maxRetries 3 cpus 1 memory { 1.GB * task.attempt } - conda 'hmmer=3.3.2' + conda 'hmmer=3.4' input: path('input.faa') from chAAs diff --git a/db-scripts/init-pscc.py b/db-scripts/init-pscc.py index c8397230..d67a3a97 100644 --- a/db-scripts/init-pscc.py +++ b/db-scripts/init-pscc.py @@ -189,8 +189,8 @@ def is_taxon_child(child, LCA, taxonomy): log_pscc.debug('written UniParc seed sequences: %i', uniparc_total_seqs) print(f'PSCC normal seqs: {pscc_seqs}') -log_pscc.debug('summary: # PSC normal=%i', pscc_seqs) +log_pscc.debug('summary: # PSCC normal=%i', pscc_seqs) print(f'PSCC sORF seqs: {pscc_sorf_seqs}') -log_pscc.debug('summary: # PSC sORFs=%i', pscc_sorf_seqs) +log_pscc.debug('summary: # PSCC sORFs=%i', pscc_sorf_seqs) print("\nsuccessfully initialized PSCC table!")