diff --git a/.gitignore b/.gitignore index ea86fd4..9255b0a 100644 --- a/.gitignore +++ b/.gitignore @@ -55,6 +55,8 @@ data/* .tags tags cscope.* +.vscode src/.tags src/tags src/cscope* +src/.vscode diff --git a/README.md b/README.md index 4f47baf..eb4c0d6 100644 --- a/README.md +++ b/README.md @@ -10,16 +10,16 @@ [![GitHub Downloads](https://img.shields.io/github/downloads/yangao07/TideHunter/total.svg?style=social&logo=github&label=Download)](https://github.com/yangao07/TideHunter/releases) --> -## Updates (v1.5.4) -* Fixed a bug related to msa (#6) -* Output single-copy full-length sequence when 5/3 adapters are provided +## Updates (v1.5.5) +* Output additional single-copy full-length sequence when 5/3 adapters are provided +* Copy number needs to be >= 2 for regular tandem repeats ## Getting started Download the [latest release](https://github.com/yangao07/TideHunter/releases): ``` -wget https://github.com/yangao07/TideHunter/releases/download/v1.5.4/TideHunter-v1.5.4.tar.gz -tar -zxvf TideHunter-v1.5.4.tar.gz && cd TideHunter-v1.5.4 +wget https://github.com/yangao07/TideHunter/releases/download/v1.5.5/TideHunter-v1.5.5.tar.gz +tar -zxvf TideHunter-v1.5.5.tar.gz && cd TideHunter-v1.5.5 ``` Make from source and run with test data: ``` @@ -32,28 +32,32 @@ TideHunter ./test_data/test_50x4.fa > cons.fa ``` ## Table of Contents -- [Introduction](#introduction) -- [Installation](#install) - - [Installing TideHunter via conda](#conda) - - [Building TideHunter from source files](#build) - - [Pre-built binary executable file for Linux/Unix](#binary) -- [Getting started with toy example in `test_data`](#start) -- [Usage](#usage) - - [To generate consensus sequences in FASTA format](#fasta_cons) - - [To generate consensus sequences in tabular format](#tab_cons) - - [To generate consensus sequences in FASTQ format](#fq_cons) - - [To generate full-length consensus sequences](#full_cons) - - [To generate unit sequences in FASTA format](#fasta_unit) - - [To generate unit 
sequences in tabular format](#tab_unit) -- [Commands and options](#cmd) -- [Input](#input) - - [Adapter sequence](#adapter) -- [Output](#output) - - [Tabular format](#tabular) - - [FASTA format](#fasta) - - [FASTQ format](#fastq) - - [Unit sequences](#unit) -- [Contact](#contact) +- [TideHunter: efficient and sensitive tandem repeat detection from noisy long reads using seed-and-chain](#tidehunter-efficient-and-sensitive-tandem-repeat-detection-from-noisy-long-reads-using-seed-and-chain) + - [Updates (v1.5.5)](#updates-v155) + - [Getting started](#getting-started) + - [Table of Contents](#table-of-contents) + - [Introduction](#introduction) + - [Installation](#installation) + - [Installing TideHunter via conda](#installing-tidehunter-via-conda) + - [Building TideHunter from source files](#building-tidehunter-from-source-files) + - [Pre-built binary executable file for Linux/Unix](#pre-built-binary-executable-file-for-linuxunix) + - [Getting started with toy example in `test_data`](#getting-started-with-toy-example-in-test_data) + - [Usage](#usage) + - [To generate consensus sequences in FASTA format](#to-generate-consensus-sequences-in-fasta-format) + - [To generate consensus sequences in tabular format](#to-generate-consensus-sequences-in-tabular-format) + - [To generate consensus sequences in FASTQ format](#to-generate-consensus-sequences-in-fastq-format) + - [To generate full-length consensus sequences](#to-generate-full-length-consensus-sequences) + - [To generate unit sequences in FASTA format](#to-generate-unit-sequences-in-fasta-format) + - [To generate unit sequences in tabular format](#to-generate-unit-sequences-in-tabular-format) + - [Commands and options](#commands-and-options) + - [Input](#input) + - [Adapter sequence](#adapter-sequence) + - [Output](#output) + - [Tabular format](#tabular-format) + - [FASTA format](#fasta-format) + - [FASTQ format](#fastq-format) + - [Unit sequences](#unit-sequences) + - [Contact](#contact) ## Introduction TideHunter is 
an efficient and sensitive tandem repeat detection and @@ -79,9 +83,9 @@ Make sure you have gcc (>=6.4.0) and zlib installed before compiling. It is recommended to download the latest release of TideHunter from the [release page](https://github.com/yangao07/TideHunter/releases). ``` -wget https://github.com/yangao07/TideHunter/releases/download/v1.5.4/TideHunter-v1.5.4.tar.gz -tar -zxvf TideHunter-v1.5.4.tar.gz -cd TideHunter-v1.5.4; make +wget https://github.com/yangao07/TideHunter/releases/download/v1.5.5/TideHunter-v1.5.5.tar.gz +tar -zxvf TideHunter-v1.5.5.tar.gz +cd TideHunter-v1.5.5; make ``` Or, you can use `git clone` command to download the source code. Don't forget to include the `--recursive` to download the codes of [abPOA](https://github.com/yangao07/abPOA). @@ -94,8 +98,8 @@ cd TideHunter; make ### Pre-built binary executable file for Linux/Unix If you meet any compiling issue, please try the pre-built binary file: ``` -wget https://github.com/yangao07/TideHunter/releases/download/v1.5.4/TideHunter-v1.5.4_x64-linux.tar.gz -tar -zxvf TideHunter-v1.5.4_x64-linux.tar.gz +wget https://github.com/yangao07/TideHunter/releases/download/v1.5.5/TideHunter-v1.5.5_x64-linux.tar.gz +tar -zxvf TideHunter-v1.5.5_x64-linux.tar.gz ``` ## Getting started with toy example in `test_data` @@ -132,13 +136,13 @@ TideHunter -u -f 2 ./test_data/test_1000x10.fa > unit.out ``` Usage: TideHunter [options] in.fa/fq > cons.fa -Options: +Options: Seeding: -k --kmer-length INT k-mer length (no larger than 16) [8] -w --window-size INT window size, set as >1 to enable minimizer seeding [1] -H --HPC-kmer use homopolymer-compressed k-mer [False] Tandem repeat criteria: - -c --min-copy INT minimum copy number of tandem repeat [2] + -c --min-copy INT minimum copy number of tandem repeat (>=2) [2] -e --max-diverg INT maximum allowed divergence rate between two consecutive repeats [0.25] -p --min-period INT minimum period size of tandem repeat (>=2) [30] -P --max-period INT maximum period 
size of tandem repeat (<=4294967295) [10K] @@ -163,12 +167,17 @@ Options: if r is integer: R = r -u --unit-seq only output unit sequences of each tandem repeat, no consensus sequence [False] -l --longest only output consensus sequence of tandem repeat that covers the longest read sequence [False] - -F --full-len only output full-length consensus sequence [False] + -F --full-len only output full-length consensus sequence. [False] + full-length: consensus sequence contains both 5' and 3' adapter sequence + *Note* only effective when -5 and -3 are provided. + -s --single-copy output additional single-copy full-length consensus sequence. [False] + *Note* only effective when -F is set and -5 and -3 are provided. -f --out-fmt INT output format [1] - 1: FASTA - 2: Tabular - 3: FASTQ - qualiy score of each base represents the ratio of the consensus coverage to the # total copies. + - 4: Tabular with quality score + for [3] and [4], quality score of each base represents the ratio of the consensus coverage to the # total copies. 
Computing resource: -t --thread INT number of threads to use [4] diff --git a/src/abpoa_cons.c b/src/abpoa_cons.c index eb7bf14..cc0b869 100644 --- a/src/abpoa_cons.c +++ b/src/abpoa_cons.c @@ -58,7 +58,7 @@ int abpoa_gen_cons(abpoa_t *ab, abpoa_para_t *abpt, uint8_t *bseqs, int seq_len, if (mtp->min_frac > 0.0) min_cov = (int)(n_seqs * mtp->min_frac); else if (mtp->min_cov > 0) min_cov = mtp->min_cov; if (n_seqs <= 2) { - if (n_seqs <= 1) err_fatal_simple("No enough sequences to perform msa.\n"); + if (n_seqs <= 1) err_fatal_simple("Not enough sequences to perform msa.\n"); cons_len = seq_lens[0]; int skip = 0; diff --git a/src/main.c b/src/main.c index 96be3f9..8c797a7 100644 --- a/src/main.c +++ b/src/main.c @@ -10,7 +10,7 @@ #include "kseq.h" const char PROG[20] = "TideHunter"; -const char VERSION[20] = "1.5.4"; +const char VERSION[20] = "1.5.5"; const char CONTACT[30] = "gaoy1@chop.edu"; const struct option mini_tandem_opt [] = { @@ -85,7 +85,7 @@ static int usage(void) err_printf(" Tandem repeat criteria:\n"); // TODO min_copy < 2 ??? 
- err_printf(" -c --min-copy INT minimum copy number of tandem repeat [%d]\n", MIN_COPY); + err_printf(" -c --min-copy INT minimum copy number of tandem repeat (>=%d) [%d]\n", MIN_COPY, MIN_COPY); err_printf(" -e --max-diverg INT maximum allowed divergence rate between two consecutive repeats [%.2f]\n", MAX_DIV); err_printf(" -p --min-period INT minimum period size of tandem repeat (>=%u) [%u]\n", MIN_PERIOD, DEF_MIN_PERIOD); err_printf(" -P --max-period INT maximum period size of tandem repeat (<=%u) [%s]\n", MAX_PERIOD, DEF_MAX_PERIOD_STR); @@ -118,7 +118,11 @@ static int usage(void) err_printf(" if \e[4mr\e[0m is integer: \e[4mR\e[0m = \e[4mr\e[0m\n"); err_printf(" -u --unit-seq only output unit sequences of each tandem repeat, no consensus sequence [False]\n"); err_printf(" -l --longest only output consensus sequence of tandem repeat that covers the longest read sequence [False]\n"); - err_printf(" -F --full-len only output full-length consensus sequence [False]\n"); + err_printf(" -F --full-len only output full-length consensus sequence. [False]\n"); + err_printf(" full-length: consensus sequence contains both 5' and 3' adapter sequence\n"); + err_printf(" *Note* only effective when -5 and -3 are provided.\n"); + err_printf(" -s --single-copy output additional single-copy full-length consensus sequence. 
[False]\n"); + err_printf(" *Note* only effective when -F is set and -5 and -3 are provided.\n"); err_printf(" -f --out-fmt INT output format [%d]\n", FASTA_FMT); err_printf(" - %d: FASTA\n", FASTA_FMT); err_printf(" - %d: Tabular\n", TAB_FMT); @@ -350,6 +354,7 @@ mini_tandem_para *mini_tandem_init_para(void) { mtp->only_unit = 0; mtp->only_longest = 0; mtp->only_full_length = 0; + mtp->single_copy = 0; mtp->out_fmt = FASTA_FMT; mtp->detail_fp = NULL; @@ -451,7 +456,12 @@ int main(int argc, char *argv[]) // case 's': mtp->s = atoi(optarg); break; case 'H': mtp->hpc = 1; break; - case 'c': mtp->min_copy = atoi(optarg); break; + case 'c': mtp->min_copy = atoi(optarg); + if (mtp->min_copy < MIN_COPY) { + err_printf("Error: -c --min-copy needs to be >= %d. (%d)\n", MIN_COPY, mtp->min_copy); + goto End; + } + break; case 'e': mtp->max_div = atof(optarg); break; case 'p': mtp->min_p = th_parse_num(optarg); if (mtp->min_p < MIN_PERIOD) { @@ -486,6 +496,7 @@ int main(int argc, char *argv[]) case 'u': mtp->only_unit = 1; break; case 'l': mtp->only_longest = 1; break; case 'F': mtp->only_full_length = 1; break; + case 's': mtp->single_copy = 1; break; case 'f': mtp->out_fmt = atoi(optarg); if (mtp->out_fmt != FASTA_FMT && mtp->out_fmt != TAB_FMT && mtp->out_fmt != FASTQ_FMT && mtp->out_fmt != TAB_QUAL_FMT) { err_printf("\n[main] Error: unknown format number. 
(-%c)\n", c); diff --git a/src/tidehunter.c b/src/tidehunter.c index 662b9b5..652a4e5 100644 --- a/src/tidehunter.c +++ b/src/tidehunter.c @@ -46,7 +46,7 @@ int tidehunter_core(kseq_t *read_seq, tandem_seq_t *tseq, mini_tandem_para *mtp, } free(bseq); - if (tseq->cons_n == 0 && mtp->only_full_length && mtp->five_seq != NULL && mtp->three_seq != NULL) { // for 1-copy full-length seq + if (mtp->single_copy == 1 && mtp->only_full_length && mtp->five_seq != NULL && mtp->three_seq != NULL) { // for 1-copy full-length seq single_copy_full_len_seq(seq_len, seq, tseq, mtp); } diff --git a/src/tidehunter.h b/src/tidehunter.h index a4c608c..d3668a2 100644 --- a/src/tidehunter.h +++ b/src/tidehunter.h @@ -55,7 +55,7 @@ typedef struct { float ada_match_rat; char *five_fn, *five_seq, *five_rc_seq; int five_len; char *three_fn, *three_seq, *three_rc_seq; int three_len; - int out_fmt, min_len, only_unit, only_longest, only_full_length; // only output the cons that spans the longest sequence + int out_fmt, min_len, only_unit, only_longest, only_full_length, single_copy; // only output the cons that spans the longest sequence FILE *cons_out, *detail_fp; //char detail_out[1024]; int n_thread; } mini_tandem_para;