nf-core · pinin4fjords · Jan 29, 2024 · Dec 20, 2023 · Dec 21, 2023 · Dec 21, 2023
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
@@ -18,11 +18,11 @@
                 "python.linting.flake8Path": "/opt/conda/bin/flake8",
                 "python.linting.pycodestylePath": "/opt/conda/bin/pycodestyle",
                 "python.linting.pydocstylePath": "/opt/conda/bin/pydocstyle",
-                "python.linting.pylintPath": "/opt/conda/bin/pylint"
+                "python.linting.pylintPath": "/opt/conda/bin/pylint",
             },
 
             // Add the IDs of extensions you want installed when the container is created.
-            "extensions": ["ms-python.python", "ms-python.vscode-pylance", "nf-core.nf-core-extensionpack"]
-        }
-    }
+            "extensions": ["ms-python.python", "ms-python.vscode-pylance", "nf-core.nf-core-extensionpack"],
+        },
+    },
 }
diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml
@@ -1,6 +1,6 @@
 report_comment: >
-  This report has been generated by the <a href="https://github.com/nf-core/riboseq/tree/dev" target="_blank">nf-core/riboseq</a>
-  analysis pipeline. For information about how to interpret these results, please see the
+
+  This report has been generated by the <a href="https://github.com/nf-core/riboseq/tree/dev" target="_blank">nf-core/riboseq</a> analysis pipeline. For information about how to interpret these results, please see the
   <a href="https://nf-co.re/riboseq/dev/docs/output" target="_blank">documentation</a>.
 report_section_order:
   "nf-core-riboseq-methods-description":
@@ -11,3 +11,169 @@ report_section_order:
     order: -1002
 
 export_plots: true
+
+# Run only these modules
+run_modules:
+  - custom_content
+  - fastqc
+  - cutadapt
+  - fastp
+  - sortmerna
+  - star
+  - hisat2
+  - rsem
+  - salmon
+  - kallisto
+  - samtools
+  - picard
+  - preseq
+  - rseqc
+  - qualimap
+
+# Order of modules
+top_modules:
+  - "fail_trimmed_samples"
+  - "fail_mapped_samples"
+  - "fail_strand_check"
+  - "star_rsem_deseq2_pca"
+  - "star_rsem_deseq2_clustering"
+  - "star_salmon_deseq2_pca"
+  - "star_salmon_deseq2_clustering"
+  - "salmon_deseq2_pca"
+  - "salmon_deseq2_clustering"
+  - "biotype_counts"
+  - "dupradar"
+
+module_order:
+  - fastqc:
+      name: "FastQC (raw)"
+      info: "This section of the report shows FastQC results before adapter trimming."
+      path_filters:
+        - "./fastqc/raw/*.zip"
+  - cutadapt
+  - fastp
+  - fastqc:
+      name: "FastQC (trimmed)"
+      info: "This section of the report shows FastQC results after adapter trimming."
+      path_filters:
+        - "./fastqc/trim/*.zip"
+
+# Don't show % Dups in the General Stats table (we have this from Picard)
+table_columns_visible:
+  fastqc:
+    percent_duplicates: False
+
+extra_fn_clean_exts:
+  - ".umi_dedup"
+  - "_val"
+  - ".markdup"
+  - "_primary"
+
+# Customise the module search patterns to speed up execution time
+#  - Skip module sub-tools that we are not interested in
+#  - Replace file-content searching with filename pattern searching
+#  - Don't add anything that is the same as the MultiQC default
+# See https://multiqc.info/docs/#optimise-file-search-patterns for details
+sp:
+  cutadapt:
+    fn: "*trimming_report.txt"
+
+  fastp:
+    fn: "*.fastp.json"
+
+  sortmerna:
+    fn: "*.sortmerna.log"
+
+  hisat2:
+    fn: "*.hisat2.summary.log"
+
+  salmon/meta:
+    fn: "meta_info.json"
+
+  preseq:
+    fn: "*.lc_extrap.txt"
+
+  samtools/stats:
+    fn: "*.stats"
+  samtools/flagstat:
+    fn: "*.flagstat"
+  samtools/idxstats:
+    fn: "*.idxstats*"
+
+  rseqc/bam_stat:
+    fn: "*.bam_stat.txt"
+  rseqc/gene_body_coverage:
+    skip: true
+  rseqc/junction_annotation:
+    fn: "*.junction_annotation.log"
+  rseqc/read_gc:
+    skip: true
+  rseqc/read_distribution:
+    fn: "*.read_distribution.txt"
+  rseqc/infer_experiment:
+    fn: "*.infer_experiment.txt"
+  rseqc/tin:
+    fn: "*.summary.txt"
+
+  picard/markdups:
+    fn: "*.MarkDuplicates.metrics.txt"
+  picard/alignment_metrics:
+    skip: true
+  picard/basedistributionbycycle:
+    skip: true
+  picard/gcbias:
+    skip: true
+  picard/hsmetrics:
+    skip: true
+  picard/insertsize:
+    skip: true
+  picard/oxogmetrics:
+    skip: true
+  picard/pcr_metrics:
+    skip: true
+  picard/quality_by_cycle:
+    skip: true
+  picard/quality_score_distribution:
+    skip: true
+  picard/quality_yield_metrics:
+    skip: true
+  picard/rnaseqmetrics:
+    skip: true
+  picard/rrbs_metrics:
+    skip: true
+  picard/sam_file_validation:
+    skip: true
+  picard/variant_calling_metrics:
+    skip: true
+  picard/wgs_metrics:
+    skip: true
+
+# See https://github.com/ewels/MultiQC_TestData/blob/master/data/custom_content/with_config/table_headerconfig/multiqc_config.yaml
+custom_data:
+  fail_trimmed_samples:
+    section_name: "WARNING: Fail Trimming Check"
+    description: "List of samples that failed the minimum trimmed reads threshold specified via the '--min_trimmed_reads' parameter, and hence were ignored for the downstream processing steps."
+    plot_type: "table"
+    pconfig:
+      id: "fail_trimmed_samples_table"
+      table_title: "Samples failed trimming threshold"
+      namespace: "Samples failed trimming threshold"
+      format: "{:.0f}"
+  fail_mapped_samples:
+    section_name: "WARNING: Fail Alignment Check"
+    description: "List of samples that failed the STAR minimum mapped reads threshold specified via the '--min_mapped_reads' parameter, and hence were ignored for the downstream processing steps."
+    plot_type: "table"
+    pconfig:
+      id: "fail_mapped_samples_table"
+      table_title: "Samples failed mapping threshold"
+      namespace: "Samples failed mapping threshold"
+      format: "{:.2f}"
+  fail_strand_check:
+    section_name: "WARNING: Fail Strand Check"
+    description: "List of samples that failed the strandedness check between that provided in the samplesheet and calculated by the <a href='http://rseqc.sourceforge.net/#infer-experiment-py'>RSeQC infer_experiment.py</a> tool."
+    plot_type: "table"
+    pconfig:
+      id: "fail_strand_check_table"
+      table_title: "Samples failed strandedness check"
+      namespace: "Samples failed strandedness check"
+      format: "{:.2f}"
diff --git a/assets/rrna-db-defaults.txt b/assets/rrna-db-defaults.txt
@@ -0,0 +1,8 @@
+https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/rfam-5.8s-database-id98.fasta
+https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/rfam-5s-database-id98.fasta
+https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/silva-arc-16s-id95.fasta
+https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/silva-arc-23s-id98.fasta
+https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/silva-bac-16s-id90.fasta
+https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/silva-bac-23s-id98.fasta
+https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/silva-euk-18s-id95.fasta
+https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/silva-euk-28s-id98.fasta
diff --git a/assets/schema_input.json b/assets/schema_input.json
@@ -10,27 +10,37 @@
             "sample": {
                 "type": "string",
                 "pattern": "^\\S+$",
-                "errorMessage": "Sample name must be provided and cannot contain spaces"
+                "errorMessage": "Sample name must be provided and cannot contain spaces",
+                "meta": ["id"]
             },
             "fastq_1": {
                 "type": "string",
+                "format": "file-path",
+                "exists": true,
                 "pattern": "^\\S+\\.f(ast)?q\\.gz$",
                 "errorMessage": "FastQ file for reads 1 must be provided, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'"
             },
             "fastq_2": {
                 "errorMessage": "FastQ file for reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'",
+                "type": "string",
+                "format": "file-path",
+                "exists": true,
                 "anyOf": [
                     {
-                        "type": "string",
                         "pattern": "^\\S+\\.f(ast)?q\\.gz$"
                     },
                     {
-                        "type": "string",
                         "maxLength": 0
                     }
                 ]
+            },
+            "strandedness": {
+                "type": "string",
+                "errorMessage": "Strandedness must be provided and be one of 'auto', 'forward', 'reverse' or 'unstranded'",
+                "enum": ["forward", "reverse", "unstranded", "auto"],
+                "meta": ["strandedness"]
             }
         },
-        "required": ["sample", "fastq_1"]
+        "required": ["sample", "fastq_1", "strandedness"]
     }
 }
diff --git a/bin/filter_gtf.py b/bin/filter_gtf.py
@@ -0,0 +1,73 @@
+#!/usr/bin/env python
+
+# Written by Olga Botvinnik with subsequent reworking by Jonathan Manning. Released under the MIT license.
+
+import logging
+import argparse
+import re
+import statistics
+from typing import Set
+
+# Create a logger
+logging.basicConfig(format="%(name)s - %(asctime)s %(levelname)s: %(message)s")
+logger = logging.getLogger("fasta_gtf_filter")
+logger.setLevel(logging.INFO)
+
+
+def extract_fasta_seq_names(fasta_name: str) -> Set[str]:
+    """Extracts the sequence names from a FASTA file."""
+    with open(fasta_name) as fasta:
+        return {line[1:].split(None, 1)[0] for line in fasta if line.startswith(">")}
+
+
+def tab_delimited(file: str) -> float:
+    """Check if file is tab-delimited and return median number of tabs."""
+    with open(file, "r") as f:
+        data = f.read(102400)
+        return statistics.median(line.count("\t") for line in data.split("\n"))
+
+
+def filter_gtf(fasta: str, gtf_in: str, filtered_gtf_out: str, skip_transcript_id_check: bool) -> None:
+    """Filter GTF file based on FASTA sequence names."""
+    if tab_delimited(gtf_in) != 8:
+        raise ValueError("Invalid GTF file: Expected 9 tab-separated columns.")
+
+    seq_names_in_genome = extract_fasta_seq_names(fasta)
+    logger.info(f"Extracted chromosome sequence names from {fasta}")
+    logger.debug("All sequence IDs from FASTA: " + ", ".join(sorted(seq_names_in_genome)))
+
+    seq_names_in_gtf = set()
+    try:
+        with open(gtf_in) as gtf, open(filtered_gtf_out, "w") as out:
+            line_count = 0
+            for line in gtf:
+                seq_name = line.split("\t")[0]
+                seq_names_in_gtf.add(seq_name)  # Add sequence name to the set
+
+                if seq_name in seq_names_in_genome:
+                    if skip_transcript_id_check or re.search(r'transcript_id "([^"]+)"', line):
+                        out.write(line)
+                        line_count += 1
+
+            if line_count == 0:
+                raise ValueError("All GTF lines removed by filters")
+
+    except IOError as e:
+        logger.error(f"File operation failed: {e}")
+        return
+
+    logger.debug("All sequence IDs from GTF: " + ", ".join(sorted(seq_names_in_gtf)))
+    logger.info(f"Extracted {line_count} matching sequences from {gtf_in} into {filtered_gtf_out}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Filters a GTF file based on sequence names in a FASTA file.")
+    parser.add_argument("--gtf", type=str, required=True, help="GTF file")
+    parser.add_argument("--fasta", type=str, required=True, help="Genome fasta file")
+    parser.add_argument("--prefix", dest="prefix", default="genes", type=str, help="Prefix for output GTF files")
+    parser.add_argument(
+        "--skip_transcript_id_check", action="store_true", help="Skip checking for transcript IDs in the GTF file"
+    )
+
+    args = parser.parse_args()
+    filter_gtf(args.fasta, args.gtf, args.prefix + ".filtered.gtf", args.skip_transcript_id_check)