stats: add AvgQual. #411
shenwei356 committed Sep 29, 2023
1 parent df9ba7d commit 19c5a65
Showing 3 changed files with 47 additions and 22 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -6,6 +6,8 @@
- add a new flag `-R/--region-coord` for appending coordinates to sequence ID for `-r/--region`. [#413](https://github.com/shenwei356/seqkit/issues/413)
- `seqkit seq`:
- change the nucleotide color theme. [#412](https://github.com/shenwei356/seqkit/pull/412)
- `seqkit stat`:
- add a new column `AvgQual` for average quality score. [#411](https://github.com/shenwei356/seqkit/issues/411)
- [SeqKit v2.5.1](https://github.com/shenwei356/seqkit/releases/tag/v2.5.1) - 2023-08-09
[![Github Releases (by Release)](https://img.shields.io/github/downloads/shenwei356/seqkit/v2.5.1/total.svg)](https://github.com/shenwei356/seqkit/releases/tag/v2.5.1)
- `seqkit stats`:
17 changes: 9 additions & 8 deletions doc/docs/usage.md
@@ -677,7 +677,8 @@ Columns:
13. N50 N50. https://en.wikipedia.org/wiki/N50,_L50,_and_related_statistics#N50
14. Q20(%) percentage of bases with the quality score greater than 20
15. Q30(%) percentage of bases with the quality score greater than 30
16. GC(%) percentage of GC content
16. AvgQual average quality
17. GC(%) percentage of GC content
Attentions:
1. Sequence length metrics (sum_len, min_len, avg_len, max_len, Q1, Q2, Q3)
@@ -767,13 +768,13 @@ Examples
1. Extra information

$ seqkit stats *.f{a,q}.gz -a
file format type num_seqs sum_len min_len avg_len max_len Q1 Q2 Q3 sum_gap N50 Q20(%) Q30(%) GC(%)
hairpin.fa.gz FASTA RNA 28,645 2,949,871 39 103 2,354 76 91 111 0 101 0 0 45.77
mature.fa.gz FASTA RNA 35,828 781,222 15 21.8 34 21 22 22 0 22 0 0 47.6
Illimina1.8.fq.gz FASTQ DNA 10,000 1,500,000 150 150 150 75 150 75 0 150 96.16 89.71 49.91
nanopore.fq.gz FASTQ DNA 4,000 1,798,723 153 449.7 6,006 271 318 391 0 395 40.79 12.63 46.66
reads_1.fq.gz FASTQ DNA 2,500 567,516 226 227 229 227 227 227 0 227 91.24 86.62 53.63
reads_2.fq.gz FASTQ DNA 2,500 560,002 223 224 225 224 224 224 0 224 91.06 87.66 54.77
file format type num_seqs sum_len min_len avg_len max_len Q1 Q2 Q3 sum_gap N50 Q20(%) Q30(%) AvgQual GC(%)
hairpin.fa.gz FASTA RNA 28,645 2,949,871 39 103 2,354 76 91 111 0 101 0 0 0 45.77
mature.fa.gz FASTA RNA 35,828 781,222 15 21.8 34 21 22 22 0 22 0 0 0 47.6
Illimina1.8.fq.gz FASTQ DNA 10,000 1,500,000 150 150 150 150 150 150 0 150 96.16 89.71 24.82 49.91
nanopore.fq.gz FASTQ DNA 4,000 1,798,723 153 449.7 6,006 271 318 391 0 395 40.79 12.63 9.48 46.66
reads_1.fq.gz FASTQ DNA 2,500 567,516 226 227 229 227 227 227 0 227 91.24 86.62 15.45 53.63
reads_2.fq.gz FASTQ DNA 2,500 560,002 223 224 225 224 224 224 0 224 91.06 87.66 14.62 54.77

1. **Parallelize counting files, it's much faster for lots of small files, especially for files on SSD**

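To make the three quality columns above concrete, here is a minimal, self-contained Go sketch of how Q20(%), Q30(%), and the new AvgQual relate to a single read's quality string. It is an illustration only, not the seqkit implementation: the Phred+33 offset and the helper name `readQualityStats` are assumptions for this sketch.

```go
package main

import (
	"fmt"
	"math"
)

// readQualityStats is a hypothetical helper mirroring the quality columns
// reported by `seqkit stats -a`. It assumes Phred+33 (Sanger/Illumina 1.8+)
// encoded quality characters.
func readQualityStats(qual []byte) (q20Pct, q30Pct, avgQual float64) {
	if len(qual) == 0 {
		return 0, 0, 0
	}
	var q20, q30 int
	var errSum float64
	for _, c := range qual {
		q := int(c) - 33 // Phred+33 offset (an assumption in this sketch)
		if q >= 20 {     // counted as in the stat.go diff below: score >= 20
			q20++
			if q >= 30 {
				q30++
			}
		}
		// Convert the score back to an error probability before averaging.
		errSum += math.Pow(10, -float64(q)/10)
	}
	n := float64(len(qual))
	q20Pct = float64(q20) / n * 100
	q30Pct = float64(q30) / n * 100
	// Mean per-base error probability mapped back onto the Phred scale.
	avgQual = -10 * math.Log10(errSum/n)
	return
}

func main() {
	// Toy quality string: 'I' = Q40 and '#' = Q2 in Phred+33.
	q20, q30, avg := readQualityStats([]byte("IIIIIIII##"))
	fmt.Printf("Q20(%%)=%.2f Q30(%%)=%.2f AvgQual=%.2f\n", q20, q30, avg)
}
```

With this toy input, Q20(%) and Q30(%) are both 80.00 while AvgQual comes out near 9, even though the arithmetic mean of the scores is 32.4: averaging in error-probability space lets a few very poor bases dominate the result.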
50 changes: 36 additions & 14 deletions seqkit/cmd/stat.go
@@ -23,6 +23,7 @@ package cmd
import (
"fmt"
"io"
"math"
"os"
"path/filepath"
"runtime"
@@ -39,7 +40,7 @@ import (
"github.com/shenwei356/bio/util"
"github.com/shenwei356/stable"
"github.com/shenwei356/util/byteutil"
"github.com/shenwei356/util/math"
mathutil "github.com/shenwei356/util/math"
"github.com/shenwei356/xopen"
"github.com/spf13/cobra"
"github.com/vbauerster/mpb/v5"
@@ -70,7 +71,8 @@ Columns:
13. N50 N50. https://en.wikipedia.org/wiki/N50,_L50,_and_related_statistics#N50
14. Q20(%) percentage of bases with the quality score greater than 20
15. Q30(%) percentage of bases with the quality score greater than 30
16. GC(%) percentage of GC content
16. AvgQual average quality
17. GC(%) percentage of GC content
Attentions:
1. Sequence length metrics (sum_len, min_len, avg_len, max_len, Q1, Q2, Q3)
@@ -188,7 +190,7 @@ Tips:
"max_len",
}
if all {
colnames = append(colnames, []string{"Q1", "Q2", "Q3", "sum_gap", "N50", "Q20(%)", "Q30(%)", "GC(%)"}...)
colnames = append(colnames, []string{"Q1", "Q2", "Q3", "sum_gap", "N50", "Q20(%)", "Q30(%)", "AvgQual", "GC(%)"}...)
}

if hasNX {
@@ -236,14 +238,15 @@ Tips:
info.lenAvg,
info.lenMax)
if all {
fmt.Fprintf(outfh, "\t%.1f\t%.1f\t%.1f\t%d\t%d\t%.2f\t%.2f\t%.2f",
fmt.Fprintf(outfh, "\t%.1f\t%.1f\t%.1f\t%d\t%d\t%.2f\t%.2f\t%.2f\t%.2f",
info.Q1,
info.Q2,
info.Q3,
info.gapSum,
info.N50,
info.q20,
info.q30,
info.avgQual,
info.gc)
}
if hasNX {
@@ -275,14 +278,15 @@ Tips:
info.lenAvg,
info.lenMax)
if all {
fmt.Fprintf(outfh, "\t%.1f\t%.1f\t%.1f\t%d\t%d\t%.2f\t%.2f\t%.2f",
fmt.Fprintf(outfh, "\t%.1f\t%.1f\t%.1f\t%d\t%d\t%.2f\t%.2f\t%.2f\t%.2f",
info.Q1,
info.Q2,
info.Q3,
info.gapSum,
info.N50,
info.q20,
info.q30,
info.avgQual,
info.gc)
}
if hasNX {
@@ -322,14 +326,15 @@ Tips:
info.lenAvg,
info.lenMax)
if all {
fmt.Fprintf(outfh, "\t%.1f\t%.1f\t%.1f\t%d\t%d\t%.2f\t%.2f\t%.2f",
fmt.Fprintf(outfh, "\t%.1f\t%.1f\t%.1f\t%d\t%d\t%.2f\t%.2f\t%.2f\t%.2f",
info.Q1,
info.Q2,
info.Q3,
info.gapSum,
info.N50,
info.q20,
info.q30,
info.avgQual,
info.gc)
}
if hasNX {
@@ -391,8 +396,11 @@ Tips:

lensStats := util.NewLengthStats()

var errSum, avgQual float64
qual_map := seq.QUAL_MAP
var q20, q30 int64
var q byte
var qual int
var encodeOffset int = fqEncoding.Offset()
var seqFormat, t string
var record *fastx.Record
Expand Down Expand Up @@ -449,12 +457,15 @@ Tips:
if all {
if fastxReader.IsFastq {
for _, q = range record.Seq.Qual {
if int(q)-encodeOffset >= 20 {
qual = int(q) - encodeOffset
if qual >= 20 {
q20++
if int(q)-encodeOffset >= 30 {
if qual >= 30 {
q30++
}
}

errSum += qual_map[qual]
}
}

@@ -480,6 +491,12 @@ Tips:
n50 = lensStats.N50()
l50 = lensStats.L50()
q1, q2, q3 = lensStats.Q1(), lensStats.Q2(), lensStats.Q3()

if errSum == 0 {
avgQual = 0
} else {
avgQual = -10 * math.Log10(errSum/float64(lensStats.Sum()))
}
}
var nx []float64
if hasNX {
@@ -505,7 +522,7 @@ Tips:
0, 0, 0, 0,
0, 0, 0, 0,
0, 0, 0,
0, 0, 0,
0, 0, 0, 0,
nx,
nil, id}
} else {
@@ -517,10 +534,12 @@ Tips:
}
ch <- statInfo{file, seqFormat, t,
lensStats.Count(), lensStats.Sum(), gapSum, lensStats.Min(),
math.Round(lensStats.Mean(), 1), lensStats.Max(), n50, l50,
mathutil.Round(lensStats.Mean(), 1), lensStats.Max(), n50, l50,
q1, q2, q3,
math.Round(float64(q20)/float64(lensStats.Sum())*100, 2), math.Round(float64(q30)/float64(lensStats.Sum())*100, 2),
math.Round(float64(gcSum)/float64(lensStats.Sum())*100, 2),
mathutil.Round(float64(q20)/float64(lensStats.Sum())*100, 2),
mathutil.Round(float64(q30)/float64(lensStats.Sum())*100, 2),
mathutil.Round(avgQual, 2),
mathutil.Round(float64(gcSum)/float64(lensStats.Sum())*100, 2),
nx,
nil, id}
}
@@ -570,6 +589,7 @@ Tips:
{Header: "N50", Align: stable.AlignRight, HumanizeNumbers: true},
{Header: "Q20(%)", Align: stable.AlignRight, HumanizeNumbers: true},
{Header: "Q30(%)", Align: stable.AlignRight, HumanizeNumbers: true},
{Header: "AvgQual", Align: stable.AlignRight, HumanizeNumbers: true},
{Header: "GC(%)", Align: stable.AlignRight, HumanizeNumbers: true},
// {Header: "L50", AlignRight: true},
}...)
@@ -601,6 +621,7 @@ Tips:
row = append(row, info.N50)
row = append(row, info.q20)
row = append(row, info.q30)
row = append(row, info.avgQual)
row = append(row, info.gc)
}
if hasNX {
@@ -634,8 +655,9 @@ type statInfo struct {
Q2 float64
Q3 float64

q20 float64
q30 float64
q20 float64
q30 float64
avgQual float64

gc float64

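For readers tracing the new `avgQual` value through the hunks above: each base adds `qual_map[qual]` to `errSum`, and the final value is `-10 * math.Log10(errSum / float64(lensStats.Sum()))`. Assuming `seq.QUAL_MAP[q]` holds the error probability 10^(-q/10) (an inference from this diff, not something confirmed elsewhere in it), the reported column is

$$\mathrm{AvgQual} = -10\,\log_{10}\!\left(\frac{1}{N}\sum_{i=1}^{N} 10^{-Q_i/10}\right)$$

that is, the mean per-base error probability mapped back onto the Phred scale rather than a plain arithmetic mean of the scores. For FASTA input there are no quality values, so `errSum` stays 0 and the guard reports AvgQual as 0, matching the FASTA rows in the updated usage table.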
