Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Preprocessing from rnaseq #8

Merged
merged 31 commits into from
Jan 29, 2024
Merged
Show file tree
Hide file tree
Changes from 30 commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
a3db3ec
Start adding preprocessing components from rnaseq
pinin4fjords Dec 20, 2023
d3be5ed
[skip ci] Unrestrict test resources while I'm mucking about
pinin4fjords Dec 21, 2023
6d96c75
Add gtf to schema
pinin4fjords Dec 21, 2023
c99d042
Reduce kmer size
pinin4fjords Dec 21, 2023
7131650
Trim before attempting strand detection
pinin4fjords Dec 21, 2023
a0f25d6
Add pass_trimmed_reads
pinin4fjords Dec 21, 2023
6c0f58e
Fix lib calls
pinin4fjords Dec 21, 2023
c8309d5
Fix test profile
pinin4fjords Dec 21, 2023
0e243ee
Update conf
pinin4fjords Dec 21, 2023
601aed2
Messy but working rnaseq-style proprocessing
pinin4fjords Dec 22, 2023
216ad1a
Encapsulated preprocessing
pinin4fjords Dec 22, 2023
2647cbc
Borrow more config from rnaseq
pinin4fjords Dec 22, 2023
22922dd
Reset test resources back
pinin4fjords Dec 22, 2023
8c7b0dc
Merge branch 'dev' into preprocessing_from_rnaseq
pinin4fjords Jan 3, 2024
5933b28
Add test_data_base default
pinin4fjords Jan 3, 2024
58ce68c
Fix multiqc config
pinin4fjords Jan 3, 2024
9db1555
Bump to fix subworkflow tests
pinin4fjords Jan 11, 2024
81de6ed
Bump sortmerna
pinin4fjords Jan 11, 2024
81a01a7
Fix HISAT2 tests
pinin4fjords Jan 11, 2024
a4cc7ed
Bump cat/fastq and update config to fix
pinin4fjords Jan 23, 2024
5e33157
Override workflow config for testing fq/subsample
pinin4fjords Jan 23, 2024
b560b6e
Bump fq/subsample
pinin4fjords Jan 23, 2024
0b963ec
Set GFFREAD test args
pinin4fjords Jan 23, 2024
aae53f5
Bump gffread
pinin4fjords Jan 23, 2024
d9c4795
Totally suppress workflow ext.args for GFFREAD
pinin4fjords Jan 23, 2024
08526e6
Run prettier
pinin4fjords Jan 23, 2024
086846e
Arrange subworkflow better
pinin4fjords Jan 23, 2024
8cffca8
Fix relative includes for new subworkflow location
pinin4fjords Jan 23, 2024
bc548a6
Bump outdated modules
pinin4fjords Jan 23, 2024
4b88f15
Update modules/local/gtf2bed/main.nf
pinin4fjords Jan 26, 2024
aeffe21
Apply rnaseq -> riboseq fixes from code review
pinin4fjords Jan 26, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions .devcontainer/devcontainer.json
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,11 @@
"python.linting.flake8Path": "/opt/conda/bin/flake8",
"python.linting.pycodestylePath": "/opt/conda/bin/pycodestyle",
"python.linting.pydocstylePath": "/opt/conda/bin/pydocstyle",
"python.linting.pylintPath": "/opt/conda/bin/pylint"
"python.linting.pylintPath": "/opt/conda/bin/pylint",
},

// Add the IDs of extensions you want installed when the container is created.
"extensions": ["ms-python.python", "ms-python.vscode-pylance", "nf-core.nf-core-extensionpack"]
}
}
"extensions": ["ms-python.python", "ms-python.vscode-pylance", "nf-core.nf-core-extensionpack"],
},
},
}
170 changes: 168 additions & 2 deletions assets/multiqc_config.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
report_comment: >
This report has been generated by the <a href="https://github.com/nf-core/riboseq/tree/dev" target="_blank">nf-core/riboseq</a>
analysis pipeline. For information about how to interpret these results, please see the

This report has been generated by the <a href="https://github.com/nf-core/riboseq/tree/dev" target="_blank">nf-core/riboseq</a> analysis pipeline. For information about how to interpret these results, please see the
<a href="https://nf-co.re/riboseq/dev/docs/output" target="_blank">documentation</a>.
report_section_order:
"nf-core-riboseq-methods-description":
Expand All @@ -11,3 +11,169 @@ report_section_order:
order: -1002

export_plots: true

# Run only these modules
run_modules:
- custom_content
- fastqc
- cutadapt
- fastp
- sortmerna
- star
- hisat2
- rsem
- salmon
- kallisto
- samtools
- picard
- preseq
- rseqc
- qualimap

# Order of modules
top_modules:
- "fail_trimmed_samples"
- "fail_mapped_samples"
- "fail_strand_check"
- "star_rsem_deseq2_pca"
- "star_rsem_deseq2_clustering"
- "star_salmon_deseq2_pca"
- "star_salmon_deseq2_clustering"
- "salmon_deseq2_pca"
- "salmon_deseq2_clustering"
- "biotype_counts"
- "dupradar"

module_order:
- fastqc:
name: "FastQC (raw)"
info: "This section of the report shows FastQC results before adapter trimming."
path_filters:
- "./fastqc/raw/*.zip"
- cutadapt
- fastp
- fastqc:
name: "FastQC (trimmed)"
info: "This section of the report shows FastQC results after adapter trimming."
path_filters:
- "./fastqc/trim/*.zip"

# Don't show % Dups in the General Stats table (we have this from Picard)
table_columns_visible:
fastqc:
percent_duplicates: False

extra_fn_clean_exts:
- ".umi_dedup"
- "_val"
- ".markdup"
- "_primary"

# Customise the module search patterns to speed up execution time
# - Skip module sub-tools that we are not interested in
# - Replace file-content searching with filename pattern searching
# - Don't add anything that is the same as the MultiQC default
# See https://multiqc.info/docs/#optimise-file-search-patterns for details
sp:
cutadapt:
fn: "*trimming_report.txt"

fastp:
fn: "*.fastp.json"

sortmerna:
fn: "*.sortmerna.log"

hisat2:
fn: "*.hisat2.summary.log"

salmon/meta:
fn: "meta_info.json"

preseq:
fn: "*.lc_extrap.txt"

samtools/stats:
fn: "*.stats"
samtools/flagstat:
fn: "*.flagstat"
samtools/idxstats:
fn: "*.idxstats*"

rseqc/bam_stat:
fn: "*.bam_stat.txt"
rseqc/gene_body_coverage:
skip: true
rseqc/junction_annotation:
fn: "*.junction_annotation.log"
rseqc/read_gc:
skip: true
rseqc/read_distribution:
fn: "*.read_distribution.txt"
rseqc/infer_experiment:
fn: "*.infer_experiment.txt"
rseqc/tin:
fn: "*.summary.txt"

picard/markdups:
fn: "*.MarkDuplicates.metrics.txt"
picard/alignment_metrics:
skip: true
picard/basedistributionbycycle:
skip: true
picard/gcbias:
skip: true
picard/hsmetrics:
skip: true
picard/insertsize:
skip: true
picard/oxogmetrics:
skip: true
picard/pcr_metrics:
skip: true
picard/quality_by_cycle:
skip: true
picard/quality_score_distribution:
skip: true
picard/quality_yield_metrics:
skip: true
picard/rnaseqmetrics:
skip: true
picard/rrbs_metrics:
skip: true
picard/sam_file_validation:
skip: true
picard/variant_calling_metrics:
skip: true
picard/wgs_metrics:
skip: true

# See https://github.com/ewels/MultiQC_TestData/blob/master/data/custom_content/with_config/table_headerconfig/multiqc_config.yaml
custom_data:
fail_trimmed_samples:
section_name: "WARNING: Fail Trimming Check"
description: "List of samples that failed the minimum trimmed reads threshold specified via the '--min_trimmed_reads' parameter, and hence were ignored for the downstream processing steps."
plot_type: "table"
pconfig:
id: "fail_trimmed_samples_table"
table_title: "Samples failed trimming threshold"
namespace: "Samples failed trimming threshold"
format: "{:.0f}"
fail_mapped_samples:
section_name: "WARNING: Fail Alignment Check"
description: "List of samples that failed the STAR minimum mapped reads threshold specified via the '--min_mapped_reads' parameter, and hence were ignored for the downstream processing steps."
plot_type: "table"
pconfig:
id: "fail_mapped_samples_table"
table_title: "Samples failed mapping threshold"
namespace: "Samples failed mapping threshold"
format: "{:.2f}"
fail_strand_check:
section_name: "WARNING: Fail Strand Check"
description: "List of samples that failed the strandedness check between that provided in the samplesheet and calculated by the <a href='http://rseqc.sourceforge.net/#infer-experiment-py'>RSeQC infer_experiment.py</a> tool."
plot_type: "table"
pconfig:
id: "fail_strand_check_table"
table_title: "Samples failed strandedness check"
namespace: "Samples failed strandedness check"
format: "{:.2f}"
8 changes: 8 additions & 0 deletions assets/rrna-db-defaults.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/rfam-5.8s-database-id98.fasta
https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/rfam-5s-database-id98.fasta
https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/silva-arc-16s-id95.fasta
https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/silva-arc-23s-id98.fasta
https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/silva-bac-16s-id90.fasta
https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/silva-bac-23s-id98.fasta
https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/silva-euk-18s-id95.fasta
https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/silva-euk-28s-id98.fasta
18 changes: 14 additions & 4 deletions assets/schema_input.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,27 +10,37 @@
"sample": {
"type": "string",
"pattern": "^\\S+$",
"errorMessage": "Sample name must be provided and cannot contain spaces"
"errorMessage": "Sample name must be provided and cannot contain spaces",
"meta": ["id"]
},
"fastq_1": {
"type": "string",
"format": "file-path",
"exists": true,
"pattern": "^\\S+\\.f(ast)?q\\.gz$",
"errorMessage": "FastQ file for reads 1 must be provided, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'"
},
"fastq_2": {
"errorMessage": "FastQ file for reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'",
"type": "string",
"format": "file-path",
"exists": true,
"anyOf": [
{
"type": "string",
"pattern": "^\\S+\\.f(ast)?q\\.gz$"
},
{
"type": "string",
"maxLength": 0
}
]
},
"strandedness": {
"type": "string",
"errorMessage": "Strandedness must be provided and be one of 'auto', 'forward', 'reverse' or 'unstranded'",
"enum": ["forward", "reverse", "unstranded", "auto"],
"meta": ["strandedness"]
}
},
"required": ["sample", "fastq_1"]
"required": ["sample", "fastq_1", "strandedness"]
}
}
73 changes: 73 additions & 0 deletions bin/filter_gtf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
#!/usr/bin/env python

# Written by Olga Botvinnik with subsequent reworking by Jonathan Manning. Released under the MIT license.

import logging
import argparse
import re
import statistics
from typing import Set

# Create a logger
logging.basicConfig(format="%(name)s - %(asctime)s %(levelname)s: %(message)s")
logger = logging.getLogger("fasta_gtf_filter")
logger.setLevel(logging.INFO)


def extract_fasta_seq_names(fasta_name: str) -> Set[str]:
"""Extracts the sequence names from a FASTA file."""
with open(fasta_name) as fasta:
return {line[1:].split(None, 1)[0] for line in fasta if line.startswith(">")}


def tab_delimited(file: str) -> float:
"""Check if file is tab-delimited and return median number of tabs."""
with open(file, "r") as f:
data = f.read(102400)
return statistics.median(line.count("\t") for line in data.split("\n"))


def filter_gtf(fasta: str, gtf_in: str, filtered_gtf_out: str, skip_transcript_id_check: bool) -> None:
"""Filter GTF file based on FASTA sequence names."""
if tab_delimited(gtf_in) != 8:
raise ValueError("Invalid GTF file: Expected 9 tab-separated columns.")

seq_names_in_genome = extract_fasta_seq_names(fasta)
logger.info(f"Extracted chromosome sequence names from {fasta}")
logger.debug("All sequence IDs from FASTA: " + ", ".join(sorted(seq_names_in_genome)))

seq_names_in_gtf = set()
try:
with open(gtf_in) as gtf, open(filtered_gtf_out, "w") as out:
line_count = 0
for line in gtf:
seq_name = line.split("\t")[0]
seq_names_in_gtf.add(seq_name) # Add sequence name to the set

if seq_name in seq_names_in_genome:
if skip_transcript_id_check or re.search(r'transcript_id "([^"]+)"', line):
out.write(line)
line_count += 1

if line_count == 0:
raise ValueError("All GTF lines removed by filters")

except IOError as e:
logger.error(f"File operation failed: {e}")
return

logger.debug("All sequence IDs from GTF: " + ", ".join(sorted(seq_names_in_gtf)))
logger.info(f"Extracted {line_count} matching sequences from {gtf_in} into {filtered_gtf_out}")


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Filters a GTF file based on sequence names in a FASTA file.")
parser.add_argument("--gtf", type=str, required=True, help="GTF file")
parser.add_argument("--fasta", type=str, required=True, help="Genome fasta file")
parser.add_argument("--prefix", dest="prefix", default="genes", type=str, help="Prefix for output GTF files")
parser.add_argument(
"--skip_transcript_id_check", action="store_true", help="Skip checking for transcript IDs in the GTF file"
)

args = parser.parse_args()
filter_gtf(args.fasta, args.gtf, args.prefix + ".filtered.gtf", args.skip_transcript_id_check)
Loading
Loading