Merge pull request #8 from BovReg/bovreg_stringtie_annotation

Bovreg_stringtie_annotation -> Master for v3.3-BOVREG-1
BovReg · Aug 8, 2022 · e00d2ca · e00d2ca
2 parents b3ff92b + b1e00d2
commit e00d2ca
Show file tree

Hide file tree

Showing 158 changed files with 1,443 additions and 216 deletions.
diff --git a/.editorconfig b/.editorconfig
@@ -11,6 +11,9 @@ indent_style = space
 [*.{yml,yaml}]
 indent_size = 2
 
+[*.json]
+insert_final_newline = unset
+
 # These files are edited and tested upstream in nf-core/modules
 [/modules/nf-core/**]
 charset = unset

diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md
@@ -74,7 +74,7 @@ If you wish to contribute a new step, please use the following coding standards:
 7. Add sanity checks for all relevant parameters.
 8. Add any new software to the `scrape_software_versions.py` script in `bin/` and the version command to the `scrape_software_versions` process in `main.nf`.
 9. Do local tests that the new code works properly and as expected.
-10. Add a new test command in `.github/workflow/ci.yaml`.
+10. Add a new test command in `.github/workflows/ci.yml`.
 11. If applicable add a [MultiQC](https://multiqc.info/) module.
 12. Update MultiQC config `assets/multiqc_config.yaml` so relevant suffixes, name clean up, General Statistics Table column order, and module figures are in the right order.
 13. Optional: Add any descriptions of MultiQC report sections and output files to `docs/output.md`.

diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -18,7 +18,7 @@ Please delete this text and anything that's not relevant from the template below
 I have checked the following places for your error:
 
 - [ ] [nf-core website: troubleshooting](https://nf-co.re/usage/troubleshooting)
-- [ ] [nf-core/rnaseq pipeline documentation](https://nf-co.re/nf-core/rnaseq/usage)
+- [ ] [nf-core/rnaseq pipeline documentation](https://nf-co.re/rnaseq/usage)
 
 ## Description of the bug
 
@@ -51,13 +51,12 @@ Have you provided the following extra information/files:
 
 ## Nextflow Installation
 
-- Version: <!-- [e.g. 19.10.0] -->
+- Version: <!-- [e.g. 21.04.0] -->
 
 ## Container engine
 
 - Engine: <!-- [e.g. Conda, Docker, Singularity, Podman, Shifter or Charliecloud] -->
 - version: <!-- [e.g. 1.0.0] -->
-- Image tag: <!-- [e.g. nfcore/rnaseq:1.0.0] -->
 
 ## Additional context
 

diff --git a/.github/workflows/awsfulltest.yml b/.github/workflows/awsfulltest.yml
@@ -1,44 +1,34 @@
 name: nf-core AWS full size tests
 # This workflow is triggered on published releases.
-# It can be additionally triggered manually with GitHub actions workflow dispatch.
+# It can be additionally triggered manually with GitHub actions workflow dispatch button.
 # It runs the -profile 'test_full' on AWS batch
 
 on:
   release:
     types: [published]
   workflow_dispatch:
-
-env:
-  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
-  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-  TOWER_ACCESS_TOKEN: ${{ secrets.AWS_TOWER_TOKEN }}
-  AWS_JOB_DEFINITION: ${{ secrets.AWS_JOB_DEFINITION }}
-  AWS_JOB_QUEUE: ${{ secrets.AWS_JOB_QUEUE }}
-  AWS_S3_BUCKET: ${{ secrets.AWS_S3_BUCKET }}
-
 jobs:
-  run-awstest:
+  run-tower:
     name: Run AWS full tests
     if: github.repository == 'nf-core/rnaseq'
     runs-on: ubuntu-latest
+    # Do a full-scale run with each of the three aligners
     strategy:
       matrix:
-        aligner: ['star_salmon', 'star_rsem', 'hisat2']
+        aligner: ["star_salmon", "star_rsem", "hisat2"]
     steps:
-      - name: Setup Miniconda
-        uses: conda-incubator/setup-miniconda@v2
+      - name: Launch workflow via tower
+        uses: nf-core/tower-action@master
         with:
-          auto-update-conda: true
-          python-version: 3.7
-      - name: Install awscli
-        run: conda install -c conda-forge awscli
-      - name: Start AWS batch job
-        # Do a full-scale run with each of the three aligners
-        run: |
-          aws batch submit-job \
-            --region eu-west-1 \
-            --job-name nf-core-rnaseq \
-            --job-queue $AWS_JOB_QUEUE \
-            --job-definition $AWS_JOB_DEFINITION \
-            --container-overrides '{"command": ["nf-core/rnaseq", "-r '"${GITHUB_SHA}"' -profile test_full --aligner '"${{matrix.aligner}}"' --outdir s3://'"${AWS_S3_BUCKET}"'/rnaseq/results-'"${GITHUB_SHA}"'/aligner_'"${{matrix.aligner}}"' -w s3://'"${AWS_S3_BUCKET}"'/rnaseq/work-'"${GITHUB_SHA}"'/'"${{matrix.aligner}}"' -with-tower"], "environment": [{"name": "TOWER_ACCESS_TOKEN", "value": "'"$TOWER_ACCESS_TOKEN"'"}]}'
-
+          workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }}
+          bearer_token: ${{ secrets.TOWER_BEARER_TOKEN }}
+          compute_env: ${{ secrets.TOWER_COMPUTE_ENV }}
+          pipeline: ${{ github.repository }}
+          revision: ${{ github.sha }}
+          workdir: s3://${{ secrets.AWS_S3_BUCKET }}/work/rnaseq/work-${{ github.sha }}
+          parameters: |
+            {
+              "outdir" : "s3://${{ secrets.AWS_S3_BUCKET }}/rnaseq/results-${{ github.sha }}/aligner_${{ matrix.aligner }}",
+              "aligner": "${{ matrix.aligner }}"
+            }
+          profiles: '[ "test_full", "aws_tower" ]'
diff --git a/.github/workflows/awstest.yml b/.github/workflows/awstest.yml
@@ -1,37 +1,27 @@
 name: nf-core AWS test
-# This workflow is triggered on push to the master branch.
-# It can be additionally triggered manually with GitHub actions workflow dispatch.
-# It runs the -profile 'test' on AWS batch.
+# This workflow can be triggered manually with the GitHub actions workflow dispatch button.
+# It runs the -profile 'test' on AWS batch
 
 on:
   workflow_dispatch:
-
-env:
-  AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
-  AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-  TOWER_ACCESS_TOKEN: ${{ secrets.AWS_TOWER_TOKEN }}
-  AWS_JOB_DEFINITION: ${{ secrets.AWS_JOB_DEFINITION }}
-  AWS_JOB_QUEUE: ${{ secrets.AWS_JOB_QUEUE }}
-  AWS_S3_BUCKET: ${{ secrets.AWS_S3_BUCKET }}
-
 jobs:
-  run-awstest:
+  run-tower:
     name: Run AWS tests
     if: github.repository == 'nf-core/rnaseq'
     runs-on: ubuntu-latest
     steps:
-      - name: Setup Miniconda
-        uses: conda-incubator/setup-miniconda@v2
+      - name: Launch workflow via tower
+        uses: nf-core/tower-action@master
+
         with:
-          auto-update-conda: true
-          python-version: 3.7
-      - name: Install awscli
-        run: conda install -c conda-forge awscli
-      - name: Start AWS batch job
-        run: |
-          aws batch submit-job \
-          --region eu-west-1 \
-          --job-name nf-core-rnaseq \
-          --job-queue $AWS_JOB_QUEUE \
-          --job-definition $AWS_JOB_DEFINITION \
-          --container-overrides '{"command": ["nf-core/rnaseq", "-r '"${GITHUB_SHA}"' -profile test --outdir s3://'"${AWS_S3_BUCKET}"'/rnaseq/results-'"${GITHUB_SHA}"' -w s3://'"${AWS_S3_BUCKET}"'/rnaseq/work-'"${GITHUB_SHA}"' -with-tower"], "environment": [{"name": "TOWER_ACCESS_TOKEN", "value": "'"$TOWER_ACCESS_TOKEN"'"}]}'
+          workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }}
+          bearer_token: ${{ secrets.TOWER_BEARER_TOKEN }}
+          compute_env: ${{ secrets.TOWER_COMPUTE_ENV }}
+          pipeline: ${{ github.repository }}
+          revision: ${{ github.sha }}
+          workdir: s3://${{ secrets.AWS_S3_BUCKET }}/work/rnaseq/work-${{ github.sha }}
+          parameters: |
+            {
+              "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/rnaseq/results-${{ github.sha }}"
+            }
+          profiles: '[ "test", "aws_tower" ]'
diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml
@@ -14,7 +14,7 @@ jobs:
       - uses: actions/checkout@v2
       - uses: actions/setup-node@v1
         with:
-          node-version: "10"
+          node-version: '10'
       - name: Install markdownlint
         run: npm install -g markdownlint-cli
       - name: Run Markdownlint
@@ -35,7 +35,7 @@ jobs:
                 * On Mac: `brew install markdownlint-cli`
                 * Everything else: [Install `npm`](https://www.npmjs.com/get-npm) then [install `markdownlint-cli`](https://www.npmjs.com/package/markdownlint-cli) (`npm install -g markdownlint-cli`)
             * Fix the markdown errors
-                * Automatically: `markdownlint . --config .github/markdownlint.yml --fix`
+                * Automatically: `markdownlint . --fix`
                 * Manually resolve anything left from `markdownlint .`
 
             Once you push these changes the test should pass, and you can hide this comment :+1:
@@ -67,7 +67,7 @@ jobs:
       - uses: actions/checkout@v1
       - uses: actions/setup-node@v1
         with:
-          node-version: "10"
+          node-version: '10'
       - name: Install yaml-lint
         run: npm install -g yaml-lint
       - name: Run yaml-lint
@@ -101,6 +101,7 @@ jobs:
   nf-core:
     runs-on: ubuntu-latest
     steps:
+
       - name: Check out pipeline code
         uses: actions/checkout@v2
 
@@ -113,8 +114,8 @@ jobs:
 
       - uses: actions/setup-python@v1
         with:
-          python-version: "3.6"
-          architecture: "x64"
+          python-version: '3.6'
+          architecture: 'x64'
 
       - name: Install dependencies
         run: |
@@ -126,7 +127,7 @@ jobs:
           GITHUB_COMMENTS_URL: ${{ github.event.pull_request.comments_url }}
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
           GITHUB_PR_COMMIT: ${{ github.event.pull_request.head.sha }}
-        run: nf-core -l lint_log.txt lint ${GITHUB_WORKSPACE} --markdown lint_results.md
+        run: nf-core -l lint_log.txt lint --dir ${GITHUB_WORKSPACE} --markdown lint_results.md
 
       - name: Save PR number
         if: ${{ always() }}
@@ -141,3 +142,4 @@ jobs:
             lint_log.txt
             lint_results.md
             PR_number.txt
+
diff --git a/.gitignore b/.gitignore
@@ -3,7 +3,6 @@ work/
 data/
 results/
 .DS_Store
-tests/
 testing/
 testing*
 *.pyc
diff --git a/.nf-core-lint.yaml b/.nf-core-lint.yaml
diff --git a/.nf-core.yml b/.nf-core.yml
@@ -0,0 +1,8 @@
+lint:
+  files_unchanged:
+    - .markdownlint.yml
+    - assets/email_template.html
+    - assets/email_template.txt
+    - bin/scrape_software_versions.py
+    - lib/NfcoreTemplate.groovy
+    - assets/multiqc_config.yaml
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,6 +3,36 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [[3.3-BOVREG-1](https://github.com/BovReg/rnaseq/releases/tag/3.3-BOVREG-1)] - 2022-08-08
+
+### Enhancements & fixes
+
+* Add Stringtie merge annotation based on the [TAGADA pipeline](https://github.com/FAANG/analysis-TAGADA)
+* Add Stringtie quantification based on the [TAGADA pipeline](https://github.com/FAANG/analysis-TAGADA)
+* Add [FEELnc](https://github.com/tderrien/FEELnc) to the pipeline to annotate lncRNAs based on the [TAGADA pipeline](https://github.com/FAANG/analysis-TAGADA)
+
+## [[3.3](https://github.com/nf-core/rnaseq/releases/tag/3.3)] - 2021-07-26
+
+### Enhancements & fixes
+
+* Updated pipeline template to [nf-core/tools 2.0.1](https://github.com/nf-core/tools/releases/tag/2.0.1)
+* [[#668](https://github.com/nf-core/rnaseq/issues/668)] - Salmon quant with UMI-tools does not work
+* [[#674](https://github.com/nf-core/rnaseq/issues/674)] - Launch pipeline regex fails
+
+### Software dependencies
+
+Note, since the pipeline is now using Nextflow DSL2, each process will be run with its own [Biocontainer](https://biocontainers.pro/#/registry). This means that on occasion it is entirely possible for the pipeline to be using different versions of the same tool. However, the overall software dependency changes compared to the last release have been listed below for reference.
+
+| Dependency  | Old version | New version |
+|-------------|-------------|-------------|
+| `umi_tools` | 1.1.1       | 1.1.2       |
+| `samtools`  | 1.10        | 1.12        |
+| `stringtie` | 2.1.4       | 2.1.7       |
+
+> **NB:** Dependency has been __updated__ if both old and new version information is present.
+> **NB:** Dependency has been __added__ if just the new version information is present.
+> **NB:** Dependency has been __removed__ if version information isn't present.
+
 ## [[3.2](https://github.com/nf-core/rnaseq/releases/tag/3.2)] - 2021-06-18
 
 ### Enhancements & fixes

diff --git a/README.md b/README.md
@@ -39,15 +39,16 @@ The SRA download functionality has been removed from the pipeline (`>=3.2`) and
 8. UMI-based deduplication ([`UMI-tools`](https://github.com/CGATOxford/UMI-tools))
 9. Duplicate read marking ([`picard MarkDuplicates`](https://broadinstitute.github.io/picard/))
 10. Transcript assembly and quantification ([`StringTie`](https://ccb.jhu.edu/software/stringtie/))
-11. Create bigWig coverage files ([`BEDTools`](https://github.com/arq5x/bedtools2/), [`bedGraphToBigWig`](http://hgdownload.soe.ucsc.edu/admin/exe/))
+11. Predict lncRNAs ([`FEELnc`](https://github.com/tderrien/FEELnc))
+12. Create bigWig coverage files ([`BEDTools`](https://github.com/arq5x/bedtools2/), [`bedGraphToBigWig`](http://hgdownload.soe.ucsc.edu/admin/exe/))
-12. Extensive quality control:
+13. Extensive quality control:
     1. [`RSeQC`](http://rseqc.sourceforge.net/)
     2. [`Qualimap`](http://qualimap.bioinfo.cipf.es/)
     3. [`dupRadar`](https://bioconductor.org/packages/release/bioc/html/dupRadar.html)
     4. [`Preseq`](http://smithlabresearch.org/software/preseq/)
     5. [`DESeq2`](https://bioconductor.org/packages/release/bioc/html/DESeq2.html)
-13. Pseudo-alignment and quantification ([`Salmon`](https://combine-lab.github.io/salmon/); *optional*)
-14. Present QC for raw read, alignment, gene biotype, sample similarity, and strand-specificity checks ([`MultiQC`](http://multiqc.info/), [`R`](https://www.r-project.org/))
+14. Pseudo-alignment and quantification ([`Salmon`](https://combine-lab.github.io/salmon/); *optional*)
+15. Present QC for raw read, alignment, gene biotype, sample similarity, and strand-specificity checks ([`MultiQC`](http://multiqc.info/), [`R`](https://www.r-project.org/))
 
 > * **NB:** Quantification isn't performed if using `--aligner hisat2` due to the lack of an appropriate option to calculate accurate expression estimates from HISAT2 derived genomic alignments. However, you can use this route if you have a preference for the alignment, QC and other types of downstream analysis compatible with the output of HISAT2.
 > * **NB:** The `--aligner star_rsem` option will require STAR indices built from version 2.7.6a or later. However, in order to support legacy usage of genomes hosted on AWS iGenomes the `--aligner star_salmon` option requires indices built with STAR 2.6.1d or earlier. Please refer to this [issue](https://github.com/nf-core/rnaseq/issues/498) for further details.
@@ -94,7 +95,9 @@ These scripts were originally written for use at the [National Genomics Infrastr
 
 The pipeline was re-written in Nextflow DSL2 by Harshil Patel ([@drpatelh](https://github.com/drpatelh)) from [The Bioinformatics & Biostatistics Group](https://www.crick.ac.uk/research/science-technology-platforms/bioinformatics-and-biostatistics/) at [The Francis Crick Institute](https://www.crick.ac.uk/), London.
 
-Many thanks to other who have helped out along the way too, including (but not limited to):
+The pipeline was adapted for use within the [BovReg](https://www.bovreg.eu/) project by Jose Espinosa-Carrasco ([@joseespinosa](https://github.com/joseespinosa)) and Björn Langer ([@bjlang](https://github.com/bjlang)). The main additions needed by the project were the annotation and quantification of de novo annotated transcripts using StringTie, and the prediction of lncRNAs using FEELnc on the output of StringTie. Both features were implemented taking the [TAGADA pipeline](https://github.com/FAANG/analysis-TAGADA/blob/master/main.nf) from the [GENE-SWitCH](https://www.gene-switch.eu/) project as inspiration. Both the BovReg and the GENE-SWitCH projects form part of the [EuroFAANG](https://eurofaang.eu/) effort to annotate the genomes of farmed animals.
+
+Many thanks to others who have helped out along the way too, including (but not limited to):
 [@Galithil](https://github.com/Galithil),
 [@pditommaso](https://github.com/pditommaso),
 [@orzechoj](https://github.com/orzechoj),
@@ -123,3 +126,9 @@ You can cite the `nf-core` publication as follows:
 > Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.
 >
 > _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).
+
+## Funding
+
+The BovReg project has received funding from the European Union’s Horizon 2020 research and innovation program under Grant Agreement No. 815668.
+
+This repository reflects only the listed contributors' views. Neither the European Commission nor its Agency REA is responsible for any use that may be made of the information it contains.
diff --git a/assets/schema_input.json b/assets/schema_input.json
@@ -1,6 +1,6 @@
 {
     "$schema": "http://json-schema.org/draft-07/schema",
-    "$id": "https://raw.githubusercontent.com/nf-core/rnaseq/master/schema_input.json",
+    "$id": "https://raw.githubusercontent.com/nf-core/rnaseq/master/assets/schema_input.json",
     "title": "nf-core/rnaseq pipeline - params.input schema",
     "description": "Schema for the file provided with params.input",
     "type": "array",

diff --git a/bin/add_biotype.py b/bin/add_biotype.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python
+
+import sys
+import re
+import argparse
+
+def __get_arguments(args=None):
+    """Parse the command line and return the resulting namespace.
+
+    ``args`` is handed straight to ``ArgumentParser.parse_args``; with the
+    default of ``None`` argparse reads ``sys.argv``. The returned namespace
+    carries the two positional paths as ``ANNOTATION_GTF`` and
+    ``REFERENCE_GFF``.
+    """
+    cli = argparse.ArgumentParser(
+        description="Scan annotations file for biotypes and append the corresponding transcript_biotype field to gene file entries",
+        epilog="Example usage: python add_biotype.py <ANNOTATION_GTF> <REFERENCE_GFF>",
+    )
+    positionals = (
+        ("ANNOTATION_GTF", "annotation file"),
+        ("REFERENCE_GFF", "reference genes file"),
+    )
+    for dest, help_text in positionals:
+        cli.add_argument(dest, type=str, help=help_text)
+    return cli.parse_args(args)
+
+def add_biotype(annotation_gtf, reference_gff):
+    """
+    Copy transcript biotypes from one annotation onto the records of another.
+
+    ``annotation_gtf`` is scanned first: for every ``transcript`` feature the
+    first ``biotype "..."`` attribute found in column 9 is remembered under
+    its ``transcript_id`` (the first record seen for an id wins).
+    ``reference_gff`` is then echoed to stdout line by line; every
+    non-comment, non-``gene`` record whose ``transcript_id`` was remembered
+    gets ``transcript_biotype "<biotype>";`` appended after a single space.
+    All other lines are printed unchanged.
+
+    Lines that are blank, have fewer than nine tab-separated columns, or lack
+    the required attribute are skipped while collecting biotypes and passed
+    through untouched while printing, instead of aborting the run.
+
+    This script is adapted from the FAANG/analysis-TAGADA pipeline
+
+    Args:
+        annotation_gtf: path of the GTF file the biotypes are read from.
+        reference_gff: path of the file whose records are annotated.
+
+    Raises:
+        OSError: if either file cannot be opened.
+    """
+    # The paths arrive as parameters; the command line is parsed in main()
+    # only, so this function can also be called programmatically.
+    transcript_id_re = re.compile(r'transcript_id "([^;]*)";*')
+    biotype_re = re.compile(r'biotype "([^;]*)";*')
+    biotypes = {}
+
+    with open(annotation_gtf) as annot_fh:
+        for line in annot_fh:
+            if line.startswith('#'):
+                continue
+            fields = line.strip().split('\t')
+            # Column 9 (index 8) holds the attributes; shorter lines carry none.
+            if len(fields) < 9 or fields[2] != "transcript":
+                continue
+            transcript = transcript_id_re.search(fields[8])
+            biotype = biotype_re.search(fields[8])
+            if transcript is None or biotype is None:
+                continue
+            # setdefault keeps the biotype of the first record seen for an id.
+            biotypes.setdefault(transcript.group(1), biotype.group(1))
+
+    with open(reference_gff) as genes_fh:
+        for line in genes_fh:
+            line = line.strip('\n')
+            fields = line.split('\t')
+            biotype = None
+            if not fields[0].startswith('#') and len(fields) >= 9 and fields[2] != "gene":
+                transcript = transcript_id_re.search(fields[8])
+                if transcript is not None:
+                    biotype = biotypes.get(transcript.group(1))
+            if biotype is None:
+                print(line)
+            else:
+                print(line, 'transcript_biotype "{}";'.format(biotype))
+
+def main(args=None):
+    """Command-line entry point: parse ``args`` and annotate the reference file.
+
+    ``args`` is forwarded to the argument parser (``None`` means the process
+    command line); the two parsed paths are passed on to ``add_biotype``.
+    """
+    parsed = __get_arguments(args)
+    annotation_path, reference_path = parsed.ANNOTATION_GTF, parsed.REFERENCE_GFF
+    add_biotype(annotation_path, reference_path)
+
+# Run only when executed as a script; main() has no return statement, so
+# sys.exit() receives None and the process ends with exit status 0.
+if __name__ == "__main__":
+    sys.exit(main())