From bb8f561e5d4dea9dd6dac7518aa8882d011b35e0 Mon Sep 17 00:00:00 2001 From: gbouras13 Date: Wed, 24 Jul 2024 13:43:13 +0930 Subject: [PATCH] update docs --- HISTORY.md | 6 ++++++ README.md | 14 +++++++++++--- docs/index.md | 4 ++-- docs/output.md | 2 +- docs/run.md | 42 ++++++++++++++++++++++++++++++++++++---- src/dnaapler/__init__.py | 2 +- 6 files changed, 59 insertions(+), 11 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 7c2522e..d3571f9 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,11 @@ # History +# 0.8.0 (2024-07-24) + +* Adds `dnaapler archaea` and adds archaeal reorientation functionality into `dnaapler all` +* Specifically, this uses 403 COG1474 genes [COG1474](https://www.ncbi.nlm.nih.gov/research/cog/cog/COG1474/) +* Relaxes the error to a warning when no BLAST hits are found - the pipeline will still complete (requested in a number of issues #74 #76 #77) + # 0.7.0 (2024-02-05) * Adds `-c/--custom_db` with `dnaapler all` to allow specifying custom databases with `dnaapler all`. diff --git a/README.md b/README.md index 36769db..ec3cb74 100644 --- a/README.md +++ b/README.md @@ -97,12 +97,13 @@ The full documentation for `dnaapler` can be found [here](https://dnaapler.readt ## Commands -* `dnaapler all`: Reorients 1 or more contigs to begin with any of dnaA, terL, repA. +* `dnaapler all`: Reorients 1 or more contigs to begin with any of dnaA, terL, repA or COG1474. - Practically, this should be the most useful command for most users. * `dnaapler chromosome`: Reorients your sequence to begin with the dnaA chromosomal replication initiator gene * `dnaapler plasmid`: Reorients your sequence to begin with the repA plasmid replication initiation gene * `dnaapler phage`: Reorients your sequence to begin with the terL large terminase subunit gene +* `dnaapler archaea`: Reorients your sequence to begin with the [COG1474 archaeal Orc1/cdc6 gene](https://www.ncbi.nlm.nih.gov/research/cog/cog/COG1474/). 
* `dnaapler custom`: Reorients your sequence to begin with a custom amino acid FASTA format gene that you specify * `dnaapler mystery`: Reorients your sequence to begin with a random CDS * `dnaapler largest`: Reorients your sequence to begin with the largest CDS @@ -146,6 +147,7 @@ Options: Commands: all Reorients contigs to begin with any of dnaA, repA... + archaea Reorients your genome to begin with the archaeal COG1474... bulk Reorients multiple genomes to begin with the same gene chromosome Reorients your genome to begin with the dnaA chromosomal... citation Print the citation(s) for this tool @@ -160,7 +162,7 @@ Commands: ``` Usage: dnaapler all [OPTIONS] - Reorients contigs to begin with any of dnaA, repA or terL + Reorients contigs to begin with any of dnaA, repA, terL or archaeal COG1474 Orc1/cdc6 Options: -h, --help Show this message and exit. @@ -202,6 +204,10 @@ dnaapler phage -i input.fasta -o output_directory_path -p my_phage_name -t 8 dnaapler plasmid -i input.fasta -o output_directory_path -p my_plasmid_name -t 8 ``` +``` +dnaapler archaea -i input.fasta -o output_directory_path -p my_archaea_name -t 8 +``` + ``` dnaapler custom -i input.fasta -o output_directory_path -p my_genome_name -t 8 -c my_custom_database_file ``` @@ -231,7 +237,9 @@ dnaapler bulk -i input_file_with_multiple_chromosomes.fasta -m chromosome -o out `dnaapler phage` uses a terL database curated using [PHROGs](https://phrogs.lmge.uca.fr). All the AA sequences of the 55 phrogs annotated as 'large terminase subunit' were downloaded, combined and depduplicated using [seqkit](https://github.com/shenwei356/seqkit) `seqkit rmdup -s -o terL.faa phrog_terL.faa`. -`dnaapler all` uses all three databases combined into one. +`dnaapler archaea` uses a database of 403 archaeal COG1474 Orc1/cdc6 genes curated from [here](https://ftp.ncbi.nlm.nih.gov/pub/wolf/COGs/arCOG/). + +`dnaapler all` uses all four databases combined into one. 
`dnaapler custom` uses a custom amino acid FASTA format file that you specify using `-c`. diff --git a/docs/index.md b/docs/index.md index f01647b..13044fc 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,8 +1,8 @@ `dnaapler` is a simple python program that takes a single nucleotide input sequence (in FASTA format), finds the desired start gene using `blastx` against an amino acid sequence database, checks that the start codon of this gene is found, and if so, then reorients the chromosome to begin with this gene on the forward strand. -It was originally designed to replicate the reorientation functionality of [Unicycler](https://github.com/rrwick/Unicycler/blob/main/unicycler/gene_data/repA.fasta) with dnaA, but for for long-read first assembled chromosomes. I have extended it to work with plasmids (`dnaapler plasmid`) and phages (`dnaapler phage`), or for any input FASTA desired with `dnaapler custom`,`dnaapler largest`, `dnaapler mystery` or `dnaapler nearest`. +It was originally designed to replicate the reorientation functionality of [Unicycler](https://github.com/rrwick/Unicycler/blob/main/unicycler/gene_data/repA.fasta) with dnaA, but for long-read first assembled chromosomes. I have extended it to work with plasmids (`dnaapler plasmid`), phages (`dnaapler phage`) and archaea (`dnaapler archaea`), or for any input FASTA desired with `dnaapler custom`, `dnaapler largest`, `dnaapler mystery` or `dnaapler nearest`. -If your input FASTA is mixed and you have 1 or more contigs (e.g. has chromosome and plasmids), you can also use `dnaapler all`, with the option to ignore some contigs with the `--ignore` parameter. This is probably the most useful command for most users. +If your input FASTA is mixed and you have 1 or more contigs (e.g. has chromosome and plasmids), you should use `dnaapler all`, with the option to ignore some contigs with the `--ignore` parameter. This is probably the most useful command for most users. 
Additionally, you can also reorient multiple bacterial chromosomes/plasmids/phages at once using the `dnaapler bulk` subcommand - it will give you more information about what contigs couldn't be rotated which may be useful. diff --git a/docs/output.md b/docs/output.md index 9457e23..7985181 100644 --- a/docs/output.md +++ b/docs/output.md @@ -24,7 +24,7 @@ If you run `dnaapler all`, the output will be slightly different. There will sti * There will be a `{prefix}_all_reorientation_summary.tsv` summary file containing the reorientation information for each contig. -This summary file will be the same as for `bulk` as explained below, but with an extra column `Gene_Reoriented` that denotes which gene was detected in each contig (dnaA, repA or terL). +This summary file will be the same as for `bulk` as explained below, but with an extra column `Gene_Reoriented` that denotes which gene was detected in each contig (dnaA, repA, terL or archaeal COG1474). ### bulk diff --git a/docs/run.md b/docs/run.md index 091f403..e0970ab 100644 --- a/docs/run.md +++ b/docs/run.md @@ -11,7 +11,7 @@ You can use BLAST with multiple threads using the `-t` or `--threads` parameters `dnaapler` will not overwrite an output directory if it already exists by default. To force overwrite, please use `-f` or `--force`. -Finally, for the BLAST based subcommands (`chromosome`, `phage`, `plasmid`, `custom` or `all`), if no BLAST hit is found, by default `dnaapler` will error and exit. +Finally, for the BLAST based subcommands (`chromosome`, `phage`, `plasmid`, `archaea`, `custom` or `all`), if no BLAST hit is found, by default `dnaapler` will error and exit. However, you can decide to autocomplete `dnaapler` using the `-a` or `--autocomplete` parameters along with `mystery` or `nearest`, which will then run those subcommands to reorient your sequence. 
@@ -20,11 +20,15 @@ Also, a seed value using `--seed_value` can be specified with `dnaapler` to ensu ### all -`dnaapler all` is designed to simultaneously orient multiple contigs that can be a mix of chromosomes, plasmids and phages. It will also work on just 1 contig. +`dnaapler all` is designed to simultaneously orient multiple contigs that can be a mix of chromosomes, plasmids, archaea and phages. It will also work on just 1 contig. If a contig has BLAST hits for both dnaA and terL or repA, dnaA will be chosen for reorientation. -If a contig has BLAST hits for both terL and repA (but not dnaA), repA will be chosen for reorientation. +If a contig has BLAST hits for both archaeal COG1474 and terL or repA, COG1474 will be chosen for reorientation. + +If a contig has BLAST hits for both terL and repA (but not dnaA or COG1474), repA will be chosen for reorientation. + +If a contig has BLAST hits for both dnaA and archaeal COG1474, dnaA will be chosen for reorientation though I assume this would be very unlikely! You can also specify a text file with `--ignore` that lists all contigs (based on their header) to be ignored during reorientation. @@ -44,7 +48,8 @@ dnaapler all -i input.fasta -o output_directory_path -t 8 --ignore ignored_cont ``` Usage: dnaapler all [OPTIONS] - Reorients contigs to begin with any of dnaA, repA or terL + Reorients contigs to begin with any of dnaA, repA, terL or archaeal COG1474 + Orc1/cdc6 Options: -h, --help Show this message and exit. @@ -152,6 +157,35 @@ Options: 13] ``` +### archaea + +Example usage with no autocomplete command: + +``` +dnaapler archaea -i input.fasta -o output_directory_path -p my_archaea_name -t 8 +``` + +``` +Usage: dnaapler archaea [OPTIONS] + + Reorients your genome to begin with the archaeal COG1474 Orc1/cdc6 origin + recognition complex gene + +Options: + -h, --help Show this message and exit. + -V, --version Show the version and exit. 
+ -i, --input PATH Path to input file in FASTA format [required] + -o, --output PATH Output directory [default: output.dnaapler] + -t, --threads INTEGER Number of threads to use with BLAST [default: 1] + -p, --prefix TEXT Prefix for output files [default: dnaapler] + -f, --force Force overwrites the output directory + -e, --evalue TEXT e value for blastx [default: 1e-10] + -a, --autocomplete TEXT Choose an option to autocomplete reorientation if + BLAST based approach fails. Must be one of: none, + mystery, largest, or nearest [default: none] + --seed_value INTEGER Random + ``` + ### custom To run `dnaapler custom`, you need to prefix an Amino Acid FASTA file containing the desired custom database gene using `-c` or `--custom_db`. diff --git a/src/dnaapler/__init__.py b/src/dnaapler/__init__.py index abf441e..7de1f4c 100755 --- a/src/dnaapler/__init__.py +++ b/src/dnaapler/__init__.py @@ -866,7 +866,7 @@ def all( custom_db, **kwargs, ): - """Reorients contigs to begin with any of dnaA, repA or terL""" + """Reorients contigs to begin with any of dnaA, repA, terL or archaeal COG1474 Orc1/cdc6""" # validates the directory (need to before I start dnaapler or else no log file is written) instantiate_dirs(output, force)