nf-core · apeltzer · Jul 8, 2019 · May 29, 2019 · May 29, 2019 · May 29, 2019
diff --git a/.travis.yml b/.travis.yml
@@ -38,7 +38,9 @@ script:
   - nf-core lint ${TRAVIS_BUILD_DIR}
   # Lint the documentation
   - markdownlint ${TRAVIS_BUILD_DIR} -c ${TRAVIS_BUILD_DIR}/.github/markdownlint.yml
-  # Run, build reference genome with STAR
+  # Run with STAR
   - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker
-  # Run, build reference genome with HISAT2
+  # Run with HISAT2
   - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --aligner hisat2
+  # Run with STAR and Salmon
+  - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pseudo_aligner salmon
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,7 +4,13 @@
 
 ### Pipeline updates
 
+* Added tximport to merge salmon output
+* Added Salmon as a supplementary method to STAR and HiSAT2
+* Added `--pseudo_aligner`, `--transcript_fasta` and `--salmon_index` parameters
+* Add `Citation` and `Quick Start` section to `README.md`
+* Integrate changes in `nf-core/tools v1.6` template
 * Add tximport and summarizedexperiment dependency [#171](https://github.com/nf-core/rnaseq/issues/171)
+* Change all boolean parameters from snake_case to camelCase and vice versa for value parameters
 * Appointed changes because of missing output of the multiqc_plots folder [#200](https://github.com/nf-core/rnaseq/issues/200)
 * Add Qualimap dependency [#202](https://github.com/nf-core/rnaseq/issues/202)
 * Obtain edgeR + dupRadar version information [#198](https://github.com/nf-core/rnaseq/issues/198) and [#112](https://github.com/nf-core/rnaseq/issues/112)
@@ -22,6 +28,7 @@
 * qualimap 2.2.2b -> 2.2.2c
 * trim-galore 0.6.1 -> 0.6.2
 * gffread 0.9.12 -> 0.11.4
+* Force matplotlib=3.0.3
 * Added Salmon 0.14.0
 * Added RSEM 1.3.2
 * Added tximport 1.0.3
@@ -60,6 +67,8 @@
 * deeptools 3.2.0 -> 3.2.1
 * trim-galore 0.5.0 -> 0.6.1
 * qualimap 2.2.2b
+* matplotlib 3.0.3
+* r-base 3.5.1
 
 ## [Version 1.2](https://github.com/nf-core/rnaseq/releases/tag/1.2) - 2018-12-12
 

diff --git a/README.md b/README.md
@@ -7,15 +7,34 @@
 [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg)](http://bioconda.github.io/)
 [![Docker](https://img.shields.io/docker/automated/nfcore/rnaseq.svg)](https://hub.docker.com/r/nfcore/rnaseq/)
 
-
 ### Introduction
 
 **nf-core/rnaseq** is a bioinformatics analysis pipeline used for RNA sequencing data.
 
-The workflow processes raw data from FastQ inputs ([FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/), [Trim Galore!](https://www.bioinformatics.babraham.ac.uk/projects/trim_galore/)), aligns the reads ([STAR](https://github.com/alexdobin/STAR) or [HiSAT2](https://ccb.jhu.edu/software/hisat2/index.shtml)), generates gene counts ([featureCounts](http://bioinf.wehi.edu.au/featureCounts/), [StringTie](https://ccb.jhu.edu/software/stringtie/)) and performs extensive quality-control on the results ([RSeQC](http://rseqc.sourceforge.net/), [Qualimap](http://qualimap.bioinfo.cipf.es/), [dupRadar](https://bioconductor.org/packages/release/bioc/html/dupRadar.html), [Preseq](http://smithlabresearch.org/software/preseq/), [edgeR](https://bioconductor.org/packages/release/bioc/html/edgeR.html), [MultiQC](http://multiqc.info/)). See the [output documentation](docs/output.md) for more details of the results.
+The workflow processes raw data from FastQ inputs ([FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/), [Trim Galore!](https://www.bioinformatics.babraham.ac.uk/projects/trim_galore/)), aligns the reads ([STAR](https://github.com/alexdobin/STAR) or [HiSAT2](https://ccb.jhu.edu/software/hisat2/index.shtml)), generates counts relative to genes ([featureCounts](http://bioinf.wehi.edu.au/featureCounts/), [StringTie](https://ccb.jhu.edu/software/stringtie/)) or transcripts ([Salmon](https://combine-lab.github.io/salmon/), [tximport](https://bioconductor.org/packages/release/bioc/html/tximport.html)) and performs extensive quality-control on the results ([RSeQC](http://rseqc.sourceforge.net/), [Qualimap](http://qualimap.bioinfo.cipf.es/), [dupRadar](https://bioconductor.org/packages/release/bioc/html/dupRadar.html), [Preseq](http://smithlabresearch.org/software/preseq/), [edgeR](https://bioconductor.org/packages/release/bioc/html/edgeR.html), [MultiQC](http://multiqc.info/)). See the [output documentation](docs/output.md) for more details of the results.
 
 The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It comes with docker containers making installation trivial and results highly reproducible.
 
+## Quick Start
+
+i. Install [`nextflow`](https://nf-co.re/usage/installation)
+
+ii. Install one of [`docker`](https://docs.docker.com/engine/installation/), [`singularity`](https://www.sylabs.io/guides/3.0/user-guide/) or [`conda`](https://conda.io/miniconda.html)
+
+iii. Download the pipeline and test it on a minimal dataset with a single command
+
+```bash
+nextflow run nf-core/rnaseq -profile test,<docker/singularity/conda>
+```
+
+iv. Start running your own analysis!
+
+```bash
+nextflow run nf-core/rnaseq -profile <docker/singularity/conda> --reads '*_R{1,2}.fastq.gz' --genome GRCh37
+```
+
+See [usage docs](docs/usage.md) for all of the available options when running the pipeline.
+
 ### Documentation
 The nf-core/rnaseq pipeline comes with documentation about the pipeline, found in the `docs/` directory:
 
@@ -37,4 +56,15 @@ Many thanks to other who have helped out along the way too, including (but not l
 [@orzechoj](https://github.com/orzechoj),
 [@apeltzer](https://github.com/apeltzer),
 [@colindaven](https://github.com/colindaven),
-[@jburos](https://github.com/jburos).
+[@lpantano](https://github.com/lpantano),
+[@olgabot](https://github.com/olgabot),
+[@jburos](https://github.com/jburos),
+[@drpatelh](https://github.com/drpatelh).
+
+## Citation
+
+<!-- TODO nf-core: Add citation for pipeline after first release. Uncomment lines below and update Zenodo doi. -->
+If you use nf-core/rnaseq for your analysis, please cite it using the following doi: [10.5281/zenodo.1400710](https://doi.org/10.5281/zenodo.1400710)
+
+You can cite the `nf-core` pre-print as follows:  
+Ewels PA, Peltzer A, Fillinger S, Alneberg JA, Patel H, Wilm A, Garcia MU, Di Tommaso P, Nahnsen S. **nf-core: Community curated bioinformatics pipelines**. *bioRxiv*. 2019. p. 610741. [doi: 10.1101/610741](https://www.biorxiv.org/content/10.1101/610741v1).
diff --git a/assets/multiqc_config.yaml b/assets/multiqc_config.yaml
@@ -3,6 +3,7 @@ extra_fn_clean_exts:
     - _R2
     - .hisat
     - '.sorted.markDups'
+    - '.sorted'
 
 report_comment: >
     This report has been generated by the <a href="https://github.com/nf-core/rnaseq" target="_blank">nf-core/rnaseq</a>

diff --git a/bin/parse_gtf.py b/bin/parse_gtf.py
@@ -0,0 +1,129 @@
+#!/usr/bin/env python
+from __future__ import print_function
+from collections import OrderedDict, defaultdict, Counter
+import logging
+import argparse
+import glob
+import os
+
+# Create a logger
+logging.basicConfig(format='%(name)s - %(asctime)s %(levelname)s: %(message)s')
+logger = logging.getLogger(__file__)
+logger.setLevel(logging.INFO)
+
+
+def read_top_transcript(salmon):
+    """Collect a sample of transcript names from one Salmon quantification.
+
+    Args:
+        salmon: directory searched for ``<salmon>/*/quant.sf`` files.
+
+    Returns:
+        A set of transcript names (first column) read from the first
+        ``quant.sf`` in sorted path order; reading stops once the set holds
+        more than 100 names.
+
+    Raises:
+        IOError: if no ``quant.sf`` file is found.
+    """
+    quant_files = sorted(glob.glob(os.path.join(salmon, "*", "quant.sf")))
+    if not quant_files:
+        raise IOError("No quant.sf file found under %s" % salmon)
+    txs = set()
+    # Index 0, not 1: a run with a single sample has exactly one match.
+    with open(quant_files[0]) as inh:
+        for line in inh:
+            if line.startswith("Name"):
+                continue
+            fields = line.split()
+            if not fields:
+                continue
+            txs.add(fields[0])
+            if len(txs) > 100:
+                break
+    logger.info("Transcripts found in FASTA: %s" % txs)
+    return txs
+
+
+def _gtf_attribute_pairs(line):
+    """Return the (key, value) attribute pairs of one GTF line.
+
+    Comment lines and lines with fewer than nine tab-separated columns give
+    an empty list. Double quotes are removed from values.
+    """
+    if line.startswith("#"):
+        return []
+    cols = line.split("\t")
+    if len(cols) < 9:
+        return []
+    pairs = []
+    for gff_item in cols[8].split(";"):
+        item_pair = gff_item.strip().split(" ")
+        if len(item_pair) > 1:
+            pairs.append((item_pair[0].strip(), item_pair[1].strip().replace("\"", "")))
+    return pairs
+
+
+def tx2gene(gtf, salmon, gene_id, extra, out):
+    """Write a transcript-to-gene CSV for tximport.
+
+    The attribute holding transcript IDs is detected rather than assumed:
+    each GTF attribute whose value equals a transcript name sampled from the
+    Salmon output gets a vote, and the attribute with the most votes wins.
+
+    Args:
+        gtf: path to the GTF file.
+        salmon: Salmon output directory (see ``read_top_transcript``).
+        gene_id: GTF attribute used as the gene identifier.
+        extra: GTF attribute written as third column; the gene identifier
+            is written instead for records that lack it.
+        out: path of the CSV to write, one ``transcript,gene,extra`` line
+            per distinct transcript/gene pair.
+
+    Returns:
+        None. No file is written when no attribute matches a transcript.
+    """
+    txs = read_top_transcript(salmon)
+
+    # Pass 1: vote for the attribute that carries the transcript ID.
+    votes = Counter()
+    with open(gtf) as inh:
+        for line in inh:
+            for key, value in _gtf_attribute_pairs(line):
+                if value in txs:
+                    votes[key] += 1
+
+    if not votes:
+        logger.warning("No attribute in GTF matching transcripts")
+        return None
+
+    txid = votes.most_common(1)[0][0]
+    logger.info("Attributed found to be transcript: %s" % txid)
+
+    # Pass 2: emit every transcript once. Keying the output by gene alone
+    # would keep a single transcript per gene and drop the rest.
+    seen = set()
+    with open(gtf) as inh, open(out, 'w') as outh:
+        for line in inh:
+            attrs = dict(_gtf_attribute_pairs(line))
+            # Records lacking the transcript or gene attribute cannot be
+            # mapped and are skipped.
+            if txid not in attrs or gene_id not in attrs:
+                continue
+            pair = (attrs[txid], attrs[gene_id])
+            if pair in seen:
+                continue
+            seen.add(pair)
+            print("%s,%s,%s" % (pair[0], pair[1], attrs.get(extra, pair[1])), file=outh)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="""Get tx to gene names for tximport""")
+    parser.add_argument("--gtf", type=str, required=True, help="GTF file")
+    parser.add_argument("--salmon", type=str, required=True, help="output of salmon")
+    parser.add_argument("--id", type=str, required=True, help="gene id in the gtf file")
+    parser.add_argument("--extra", type=str, help="extra id in the gtf file")
+    parser.add_argument("-o", "--output", dest='output', default='tx2gene.csv', type=str, help="file with output")
+
+    args = parser.parse_args()
+    tx2gene(args.gtf, args.salmon, args.id, args.extra, args.output)
diff --git a/bin/scrape_software_versions.py b/bin/scrape_software_versions.py
@@ -14,6 +14,7 @@
     'Picard MarkDuplicates': ['v_markduplicates.txt', r"([\d\.]+)-SNAPSHOT"],
     'Samtools': ['v_samtools.txt', r"samtools (\S+)"],
     'featureCounts': ['v_featurecounts.txt', r"featureCounts v(\S+)"],
+    'Salmon': ['v_salmon.txt', r"salmon (\S+)"],
     'deepTools': ['v_deeptools.txt', r"bamCoverage (\S+)"],
     'StringTie': ['v_stringtie.txt', r"(\S+)"],
     'Preseq': ['v_preseq.txt', r"Version: (\S+)"],
@@ -34,6 +35,7 @@
 results['Picard MarkDuplicates'] = '<span style="color:#999999;\">N/A</span>'
 results['Samtools'] = '<span style="color:#999999;\">N/A</span>'
 results['featureCounts'] = '<span style="color:#999999;\">N/A</span>'
+results['Salmon'] = '<span style="color:#999999;\">N/A</span>'
 results['StringTie'] = '<span style="color:#999999;\">N/A</span>'
 results['Preseq'] = '<span style="color:#999999;\">N/A</span>'
 results['deepTools'] = '<span style="color:#999999;\">N/A</span>'

diff --git a/bin/tximport.r b/bin/tximport.r
@@ -0,0 +1,96 @@
+#!/usr/bin/env Rscript
+
+# Merge per-sample Salmon quantifications with tximport.
+#
+# Usage: tximport.r <coldata> <salmon_out>
+#   <coldata>     directory that may hold a sample-sheet CSV
+#   <salmon_out>  directory searched recursively for quant.sf files
+#
+# Reads "tx2gene.csv" (transcript, gene id, gene name; no header row) from
+# the working directory. Always writes se.rds and the merged_salmon_tx_*.csv
+# tables; additionally writes gse.rds and the merged_salmon_gene_*.csv tables
+# when tx2gene.csv exists and is not empty.
+
+args = commandArgs(trailingOnly=TRUE)
+if (length(args) < 2) {
+  stop("Usage: tximport.r <coldata> <salmon_out>", call.=FALSE)
+}
+
+path = args[2]
+coldata = args[1]
+
+tx2gene = "tx2gene.csv"
+rowdata = NULL
+info = file.info(tx2gene)
+# file.info() reports size NA for a missing file; treat that like an empty one.
+if (is.na(info$size) || info$size == 0){
+  tx2gene = NULL
+}else{
+  rowdata = read.csv(tx2gene, header = FALSE)
+  colnames(rowdata) = c("tx", "gene_id", "gene_name")
+  tx2gene = rowdata[,1:2]
+}
+
+fns = list.files(path, pattern = "quant.sf", recursive = TRUE, full.names = TRUE)
+if (length(fns) == 0) {
+  stop("No quant.sf files found under ", path, call.=FALSE)
+}
+# Each file is named after the directory that contains it.
+names = basename(dirname(fns))
+names(fns) = names
+coldata = list.files(coldata, full.names = TRUE)
+if (length(coldata)==0){
+  coldata = "NULL"
+}
+if (file.exists(coldata)){
+    coldata = read.csv(coldata)
+    coldata = coldata[match(names, coldata[,1]),]
+    coldata = cbind(files = fns, coldata)
+}else{
+    message("ColData not avaliable ", coldata)
+    coldata = data.frame(files = fns, names = names)
+}
+
+library(SummarizedExperiment)
+library(tximport)
+
+# Import at transcript level; gene-level values are summarized further down.
+txi = tximport(fns, type = "salmon", txOut = TRUE)
+rownames(coldata) = coldata[["names"]]
+tx_assays = list(counts = txi[["counts"]],
+                 abundance = txi[["abundance"]],
+                 length = txi[["length"]])
+if (is.null(rowdata)){
+  # No annotation table was read, so no rowData can be attached.
+  se = SummarizedExperiment(assays = tx_assays, colData = DataFrame(coldata))
+}else{
+  rowdata = rowdata[match(rownames(txi[[1]]), rowdata[["tx"]]),]
+  se = SummarizedExperiment(assays = tx_assays,
+                            colData = DataFrame(coldata),
+                            rowData = rowdata)
+}
+if (!is.null(tx2gene)){
+  gi = summarizeToGene(txi, tx2gene = tx2gene)
+  growdata = unique(rowdata[,2:3])
+  growdata = growdata[match(rownames(gi[[1]]), growdata[["gene_id"]]),]
+  gse = SummarizedExperiment(assays = list(counts = gi[["counts"]],
+                                          abundance = gi[["abundance"]],
+                                          length = gi[["length"]]),
+                            colData = DataFrame(coldata),
+                            rowData = growdata)
+}
+
+if(exists("gse")){
+  saveRDS(gse, file = "gse.rds")
+  # Gene-level tables come from gse; se holds the transcript-level values.
+  write.csv(assays(gse)[["abundance"]], "merged_salmon_gene_tpm.csv")
+  write.csv(assays(gse)[["counts"]], "merged_salmon_gene_reads.csv")
+}
+
+saveRDS(se, file = "se.rds")
+write.csv(assays(se)[["abundance"]], "merged_salmon_tx_tpm.csv")
+write.csv(assays(se)[["counts"]], "merged_salmon_tx_reads.csv")
+
+# Print citation and session info to standard out
+citation("tximport")
+sessionInfo()
diff --git a/conf/base.config b/conf/base.config
@@ -20,20 +20,6 @@ process {
   maxErrors = '-1'
 
   // Process-specific resource requirements
-  withName: trim_galore {
-    time = { check_max( 8.h * task.attempt, 'time' ) }
-  }
-  withName:markDuplicates {
-	  // Actually the -Xmx value should be kept lower,
-    // and is set through the markdup_java_options
-    cpus = { check_max( 8, 'cpus' ) }
-    memory = { check_max( 8.GB * task.attempt, 'memory' ) }
-  }
-  withName: makeHISATindex {
-    cpus = { check_max( 10, 'cpus' ) }
-    memory = { check_max( 200.GB * task.attempt, 'memory' ) }
-    time = { check_max( 5.h * task.attempt, 'time' ) }
-  }
   withLabel: low_memory {
     memory = { check_max( 16.GB * task.attempt, 'memory' ) }
   }
@@ -46,7 +32,26 @@ process {
     memory = { check_max( 80.GB * task.attempt, 'memory' ) }
     time = { check_max( 8.h * task.attempt, 'time' ) }
   }
-  withName: "multiqc|get_software_versions" {
+
+  withName: makeHISATindex {
+    cpus = { check_max( 10, 'cpus' ) }
+    memory = { check_max( 200.GB * task.attempt, 'memory' ) }
+    time = { check_max( 5.h * task.attempt, 'time' ) }
+  }
+  withName: trim_galore {
+    time = { check_max( 8.h * task.attempt, 'time' ) }
+  }
+  withName: markDuplicates {
+    // Actually the -Xmx value should be kept lower,
+    // and is set through the markdup_java_options
+    cpus = { check_max( 8, 'cpus' ) }
+    memory = { check_max( 8.GB * task.attempt, 'memory' ) }
+  }
+  withLabel: salmon {
+    cpus = { check_max( 8, 'cpus' ) }
+    memory = { check_max( 16.GB * task.attempt, 'memory' ) }
+  }
+  withName: 'multiqc|get_software_versions' {
     memory = { check_max( 2.GB * task.attempt, 'memory' ) }
     cache = false
   }

diff --git a/conf/test.config b/conf/test.config
@@ -27,4 +27,5 @@ params {
   // Genome references
   fasta = 'https://github.com/nf-core/test-datasets/raw/rnaseq/reference/genome.fa'
   gtf = 'https://github.com/nf-core/test-datasets/raw/rnaseq/reference/genes.gtf'
+  // Key matches the `--transcript_fasta` parameter so the Salmon test run picks up this file
+  transcript_fasta = 'https://github.com/nf-core/test-datasets/raw/rnaseq/reference/transcriptome.fasta'
 }