diff --git a/.gitignore b/.gitignore
index ea86fd4..9255b0a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -55,6 +55,8 @@ data/*
.tags
tags
cscope.*
+.vscode
src/.tags
src/tags
src/cscope*
+src/.vscode
diff --git a/README.md b/README.md
index 4f47baf..eb4c0d6 100644
--- a/README.md
+++ b/README.md
@@ -10,16 +10,16 @@
[![GitHub Downloads](https://img.shields.io/github/downloads/yangao07/TideHunter/total.svg?style=social&logo=github&label=Download)](https://github.com/yangao07/TideHunter/releases)
-->
-## Updates (v1.5.4)
-* Fixed a bug related to msa (#6)
-* Output single-copy full-length sequence when 5/3 adapters are provided
+## Updates (v1.5.5)
+* Output additional single-copy full-length sequence when 5'/3' adapters are provided
+* Copy number needs to be >= 2 for regular tandem repeats
## Getting started
Download the [latest release](https://github.com/yangao07/TideHunter/releases):
```
-wget https://github.com/yangao07/TideHunter/releases/download/v1.5.4/TideHunter-v1.5.4.tar.gz
-tar -zxvf TideHunter-v1.5.4.tar.gz && cd TideHunter-v1.5.4
+wget https://github.com/yangao07/TideHunter/releases/download/v1.5.5/TideHunter-v1.5.5.tar.gz
+tar -zxvf TideHunter-v1.5.5.tar.gz && cd TideHunter-v1.5.5
```
Make from source and run with test data:
```
@@ -32,28 +32,32 @@ TideHunter ./test_data/test_50x4.fa > cons.fa
```
## Table of Contents
-- [Introduction](#introduction)
-- [Installation](#install)
- - [Installing TideHunter via conda](#conda)
- - [Building TideHunter from source files](#build)
- - [Pre-built binary executable file for Linux/Unix](#binary)
-- [Getting started with toy example in `test_data`](#start)
-- [Usage](#usage)
- - [To generate consensus sequences in FASTA format](#fasta_cons)
- - [To generate consensus sequences in tabular format](#tab_cons)
- - [To generate consensus sequences in FASTQ format](#fq_cons)
- - [To generate full-length consensus sequences](#full_cons)
- - [To generate unit sequences in FASTA format](#fasta_unit)
- - [To generate unit sequences in tabular format](#tab_unit)
-- [Commands and options](#cmd)
-- [Input](#input)
- - [Adapter sequence](#adapter)
-- [Output](#output)
- - [Tabular format](#tabular)
- - [FASTA format](#fasta)
- - [FASTQ format](#fastq)
- - [Unit sequences](#unit)
-- [Contact](#contact)
+- [TideHunter: efficient and sensitive tandem repeat detection from noisy long reads using seed-and-chain](#tidehunter-efficient-and-sensitive-tandem-repeat-detection-from-noisy-long-reads-using-seed-and-chain)
+ - [Updates (v1.5.5)](#updates-v155)
+ - [Getting started](#getting-started)
+ - [Table of Contents](#table-of-contents)
+ - [Introduction](#introduction)
+ - [Installation](#installation)
+ - [Installing TideHunter via conda](#installing-tidehunter-via-conda)
+ - [Building TideHunter from source files](#building-tidehunter-from-source-files)
+ - [Pre-built binary executable file for Linux/Unix](#pre-built-binary-executable-file-for-linuxunix)
+ - [Getting started with toy example in `test_data`](#getting-started-with-toy-example-in-test_data)
+ - [Usage](#usage)
+ - [To generate consensus sequences in FASTA format](#to-generate-consensus-sequences-in-fasta-format)
+ - [To generate consensus sequences in tabular format](#to-generate-consensus-sequences-in-tabular-format)
+ - [To generate consensus sequences in FASTQ format](#to-generate-consensus-sequences-in-fastq-format)
+ - [To generate full-length consensus sequences](#to-generate-full-length-consensus-sequences)
+ - [To generate unit sequences in FASTA format](#to-generate-unit-sequences-in-fasta-format)
+ - [To generate unit sequences in tabular format](#to-generate-unit-sequences-in-tabular-format)
+ - [Commands and options](#commands-and-options)
+ - [Input](#input)
+ - [Adapter sequence](#adapter-sequence)
+ - [Output](#output)
+ - [Tabular format](#tabular-format)
+ - [FASTA format](#fasta-format)
+ - [FASTQ format](#fastq-format)
+ - [Unit sequences](#unit-sequences)
+ - [Contact](#contact)
## Introduction
TideHunter is an efficient and sensitive tandem repeat detection and
@@ -79,9 +83,9 @@ Make sure you have gcc (>=6.4.0) and zlib installed before compiling.
It is recommended to download the latest release of TideHunter
from the [release page](https://github.com/yangao07/TideHunter/releases).
```
-wget https://github.com/yangao07/TideHunter/releases/download/v1.5.4/TideHunter-v1.5.4.tar.gz
-tar -zxvf TideHunter-v1.5.4.tar.gz
-cd TideHunter-v1.5.4; make
+wget https://github.com/yangao07/TideHunter/releases/download/v1.5.5/TideHunter-v1.5.5.tar.gz
+tar -zxvf TideHunter-v1.5.5.tar.gz
+cd TideHunter-v1.5.5; make
```
Or, you can use `git clone` command to download the source code.
Don't forget to include the `--recursive` to download the codes of [abPOA](https://github.com/yangao07/abPOA).
@@ -94,8 +98,8 @@ cd TideHunter; make
### Pre-built binary executable file for Linux/Unix
If you meet any compiling issue, please try the pre-built binary file:
```
-wget https://github.com/yangao07/TideHunter/releases/download/v1.5.4/TideHunter-v1.5.4_x64-linux.tar.gz
-tar -zxvf TideHunter-v1.5.4_x64-linux.tar.gz
+wget https://github.com/yangao07/TideHunter/releases/download/v1.5.5/TideHunter-v1.5.5_x64-linux.tar.gz
+tar -zxvf TideHunter-v1.5.5_x64-linux.tar.gz
```
## Getting started with toy example in `test_data`
@@ -132,13 +136,13 @@ TideHunter -u -f 2 ./test_data/test_1000x10.fa > unit.out
```
Usage: TideHunter [options] in.fa/fq > cons.fa
-Options:
+Options:
Seeding:
-k --kmer-length INT k-mer length (no larger than 16) [8]
-w --window-size INT window size, set as >1 to enable minimizer seeding [1]
-H --HPC-kmer use homopolymer-compressed k-mer [False]
Tandem repeat criteria:
- -c --min-copy INT minimum copy number of tandem repeat [2]
+ -c --min-copy INT minimum copy number of tandem repeat (>=2) [2]
-e --max-diverg INT maximum allowed divergence rate between two consecutive repeats [0.25]
-p --min-period INT minimum period size of tandem repeat (>=2) [30]
-P --max-period INT maximum period size of tandem repeat (<=4294967295) [10K]
@@ -163,12 +167,17 @@ Options:
if r is integer: R = r
-u --unit-seq only output unit sequences of each tandem repeat, no consensus sequence [False]
-l --longest only output consensus sequence of tandem repeat that covers the longest read sequence [False]
- -F --full-len only output full-length consensus sequence [False]
+ -F --full-len only output full-length consensus sequence. [False]
+ full-length: consensus sequence contains both 5' and 3' adapter sequence
+ *Note* only effective when -5 and -3 are provided.
+ -s --single-copy output additional single-copy full-length consensus sequence. [False]
+ *Note* only effective when -F is set and -5 and -3 are provided.
-f --out-fmt INT output format [1]
- 1: FASTA
- 2: Tabular
- 3: FASTQ
- qualiy score of each base represents the ratio of the consensus coverage to the # total copies.
+ - 4: Tabular with quality score
+ for [3] and [4], quality score of each base represents the ratio of the consensus coverage to the # total copies.
Computing resource:
-t --thread INT number of threads to use [4]
diff --git a/src/abpoa_cons.c b/src/abpoa_cons.c
index eb7bf14..cc0b869 100644
--- a/src/abpoa_cons.c
+++ b/src/abpoa_cons.c
@@ -58,7 +58,7 @@ int abpoa_gen_cons(abpoa_t *ab, abpoa_para_t *abpt, uint8_t *bseqs, int seq_len,
if (mtp->min_frac > 0.0) min_cov = (int)(n_seqs * mtp->min_frac);
else if (mtp->min_cov > 0) min_cov = mtp->min_cov;
if (n_seqs <= 2) {
- if (n_seqs <= 1) err_fatal_simple("No enough sequences to perform msa.\n");
+ if (n_seqs <= 1) err_fatal_simple("Not enough sequences to perform msa.\n");
cons_len = seq_lens[0];
int skip = 0;
diff --git a/src/main.c b/src/main.c
index 96be3f9..8c797a7 100644
--- a/src/main.c
+++ b/src/main.c
@@ -10,7 +10,7 @@
#include "kseq.h"
const char PROG[20] = "TideHunter";
-const char VERSION[20] = "1.5.4";
+const char VERSION[20] = "1.5.5";
const char CONTACT[30] = "gaoy1@chop.edu";
const struct option mini_tandem_opt [] = {
@@ -85,7 +85,7 @@ static int usage(void)
err_printf(" Tandem repeat criteria:\n");
// TODO min_copy < 2 ???
- err_printf(" -c --min-copy INT minimum copy number of tandem repeat [%d]\n", MIN_COPY);
+ err_printf(" -c --min-copy INT minimum copy number of tandem repeat (>=%d) [%d]\n", MIN_COPY, MIN_COPY);
err_printf(" -e --max-diverg INT maximum allowed divergence rate between two consecutive repeats [%.2f]\n", MAX_DIV);
err_printf(" -p --min-period INT minimum period size of tandem repeat (>=%u) [%u]\n", MIN_PERIOD, DEF_MIN_PERIOD);
err_printf(" -P --max-period INT maximum period size of tandem repeat (<=%u) [%s]\n", MAX_PERIOD, DEF_MAX_PERIOD_STR);
@@ -118,7 +118,11 @@ static int usage(void)
err_printf(" if \e[4mr\e[0m is integer: \e[4mR\e[0m = \e[4mr\e[0m\n");
err_printf(" -u --unit-seq only output unit sequences of each tandem repeat, no consensus sequence [False]\n");
err_printf(" -l --longest only output consensus sequence of tandem repeat that covers the longest read sequence [False]\n");
- err_printf(" -F --full-len only output full-length consensus sequence [False]\n");
+ err_printf(" -F --full-len only output full-length consensus sequence. [False]\n");
+ err_printf(" full-length: consensus sequence contains both 5' and 3' adapter sequence\n");
+ err_printf(" *Note* only effective when -5 and -3 are provided.\n");
+ err_printf(" -s --single-copy output additional single-copy full-length consensus sequence. [False]\n");
+ err_printf(" *Note* only effective when -F is set and -5 and -3 are provided.\n");
err_printf(" -f --out-fmt INT output format [%d]\n", FASTA_FMT);
err_printf(" - %d: FASTA\n", FASTA_FMT);
err_printf(" - %d: Tabular\n", TAB_FMT);
@@ -350,6 +354,7 @@ mini_tandem_para *mini_tandem_init_para(void) {
mtp->only_unit = 0;
mtp->only_longest = 0;
mtp->only_full_length = 0;
+ mtp->single_copy = 0;
mtp->out_fmt = FASTA_FMT;
mtp->detail_fp = NULL;
@@ -451,7 +456,12 @@ int main(int argc, char *argv[])
// case 's': mtp->s = atoi(optarg); break;
case 'H': mtp->hpc = 1; break;
- case 'c': mtp->min_copy = atoi(optarg); break;
+ case 'c': mtp->min_copy = atoi(optarg);
+ if (mtp->min_copy < MIN_COPY) {
+ err_printf("Error: -c --min-copy needs to be >= %d. (%d)\n", MIN_COPY, mtp->min_copy);
+ goto End;
+ }
+ break;
case 'e': mtp->max_div = atof(optarg); break;
case 'p': mtp->min_p = th_parse_num(optarg);
if (mtp->min_p < MIN_PERIOD) {
@@ -486,6 +496,7 @@ int main(int argc, char *argv[])
case 'u': mtp->only_unit = 1; break;
case 'l': mtp->only_longest = 1; break;
case 'F': mtp->only_full_length = 1; break;
+ case 's': mtp->single_copy = 1; break;
case 'f': mtp->out_fmt = atoi(optarg);
if (mtp->out_fmt != FASTA_FMT && mtp->out_fmt != TAB_FMT && mtp->out_fmt != FASTQ_FMT && mtp->out_fmt != TAB_QUAL_FMT) {
err_printf("\n[main] Error: unknown format number. (-%c)\n", c);
diff --git a/src/tidehunter.c b/src/tidehunter.c
index 662b9b5..652a4e5 100644
--- a/src/tidehunter.c
+++ b/src/tidehunter.c
@@ -46,7 +46,7 @@ int tidehunter_core(kseq_t *read_seq, tandem_seq_t *tseq, mini_tandem_para *mtp,
}
free(bseq);
- if (tseq->cons_n == 0 && mtp->only_full_length && mtp->five_seq != NULL && mtp->three_seq != NULL) { // for 1-copy full-length seq
+ if (mtp->single_copy == 1 && mtp->only_full_length && mtp->five_seq != NULL && mtp->three_seq != NULL) { // for 1-copy full-length seq
single_copy_full_len_seq(seq_len, seq, tseq, mtp);
}
diff --git a/src/tidehunter.h b/src/tidehunter.h
index a4c608c..d3668a2 100644
--- a/src/tidehunter.h
+++ b/src/tidehunter.h
@@ -55,7 +55,7 @@ typedef struct {
float ada_match_rat;
char *five_fn, *five_seq, *five_rc_seq; int five_len;
char *three_fn, *three_seq, *three_rc_seq; int three_len;
- int out_fmt, min_len, only_unit, only_longest, only_full_length; // only output the cons that spans the longest sequence
+ int out_fmt, min_len, only_unit, only_longest, only_full_length, single_copy; // only output the cons that spans the longest sequence
FILE *cons_out, *detail_fp; //char detail_out[1024];
int n_thread;
} mini_tandem_para;