From 9c3219086104c480abef86ad31395e586e41a807 Mon Sep 17 00:00:00 2001 From: WackerO Date: Thu, 15 Jun 2023 07:45:31 +0200 Subject: [PATCH 01/30] Saving some progress, REMOVE pxnotebook_env.yml AND Dockerfilegit add modules/nf-core/proteus/ pxnotebook_env.yml Dockerfile! --- CHANGELOG.md | 2 - Dockerfile | 14 + assets/differentialabundance_report.Rmd | 9 +- conf/modules.config | 2 +- docs/output.md | 60 +- docs/usage.md | 2 +- .../limma/differential/templates/limma_de.R | 3 +- modules/nf-core/proteus/main.nf | 31 + modules/nf-core/proteus/meta.yml | 60 ++ .../templates/proteus_readproteingroups.R | 542 ++++++++++++++++++ modules/nf-core/rmarkdownnotebook/main.nf | 4 +- modules/nf-core/shinyngs/app/main.nf | 10 +- nextflow.config | 4 - nextflow_schema.json | 14 +- pxnotebook_env.yml | 21 + workflows/differentialabundance.nf | 80 ++- 16 files changed, 750 insertions(+), 108 deletions(-) create mode 100644 Dockerfile create mode 100644 modules/nf-core/proteus/main.nf create mode 100644 modules/nf-core/proteus/meta.yml create mode 100755 modules/nf-core/proteus/templates/proteus_readproteingroups.R create mode 100644 pxnotebook_env.yml diff --git a/CHANGELOG.md b/CHANGELOG.md index 960f42e5..61a55af5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,8 +9,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [[#124](https://github.com/nf-core/differentialabundance/pull/124)] - Template update for nf-core/tools v2.8 ([@pinin4fjords](https://github.com/pinin4fjords), review by [@jasmezz](https://github.com/jasmezz)) - [[#129](https://github.com/nf-core/differentialabundance/pull/129)] - Module updates to fit with recent registry changes ([@pinin4fjords](https://github.com/pinin4fjords), review by [@maxulysse](https://github.com/maxulysse), [@adamrtalbot](https://github.com/adamrtalbot)) -- [[#130](https://github.com/nf-core/differentialabundance/pull/130)] - Document reasons for lack of differential expression ([@pinin4fjords](https://github.com/pinin4fjords), review by [@jfy133](https://github.com/jfy133)) -- [[#131](https://github.com/nf-core/differentialabundance/pull/131)] - Improve gtf to table configurability ([@pinin4fjords](https://github.com/pinin4fjords), review by [@WackerO](https://github.com/WackerO)) ### `Fixed` diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..9ad9f84c --- /dev/null +++ b/Dockerfile @@ -0,0 +1,14 @@ +FROM condaforge/mambaforge +COPY pxnotebook_env.yml / +#RUN conda install -c conda-forge mamba +RUN mamba env create --file /pxnotebook_env.yml -p /opt/conda/envs/pxnotebook && \ + mamba clean --all --yes +RUN apt-get update -qq && \ + apt-get install -y zip procps ghostscript +# Add conda installation dir to PATH +ENV PATH /opt/conda/envs/pxnotebook/bin:$PATH +# Dump the details of the installed packates to a file for posterity +RUN mamba env export --name pxnotebook > pxnotebook.yml +# Instruct R processes to use these empty files instead of clashing with a local config +RUN touch .Rprofile +RUN touch .Renviron diff --git a/assets/differentialabundance_report.Rmd b/assets/differentialabundance_report.Rmd index a0977934..1b460a1a 100644 --- a/assets/differentialabundance_report.Rmd +++ b/assets/differentialabundance_report.Rmd @@ -33,8 +33,6 @@ params: features_id_col: NULL features_name_col: NULL features_metadata_cols: NULL - features_gtf_feature_type: NULL - features_gtf_table_first_field: NULL raw_matrix: null # e.g. 
0_salmon.merged.gene_counts.tsv normalised_matrix: null variance_stabilised_matrix: null # e.g. test_files/3_treatment-WT-P23H.vst.tsv @@ -241,7 +239,7 @@ assay_data <- lapply(assay_files, function(x) { colnames(mat) <- observations[[params$observations_name_col]][match(colnames(mat), rownames(observations))] # Bit hacky, but ensure log - if (max(mat) > 20){ + if (max(mat, na.rm=T) > 20){ log2(mat+1) }else{ mat @@ -254,6 +252,8 @@ rownames(observations) <- observations[[params$observations_name_col]] # Run PCA early so we can understand how important each variable is pca_datas <- lapply(names(assay_data), function(assay_type){ + capture.output(assay_data[[assay_type]], file=paste0("/home-link/iivow01/git/differentialabundance/error/test_assaydatatypebla.txt")) #, quote=F, sep="\t" + compilePCAData(assay_data[[assay_type]]) }) names(pca_datas) <- names(assay_data) @@ -579,7 +579,8 @@ for (assay_type in rev(names(assay_data))){ variable_genes <- selectVariableGenes(matrix = assay_data[[assay_type]], ntop = params$exploratory_n_features) dendroColorScale <- makeColorScale(length(unique(observations[[iv]])), palette = params$exploratory_palette_name) - + capture.output(assay_data[[assay_type]][variable_genes, ], file=paste0("/home-link/iivow01/git/differentialabundance/error2/dendrostuff_test_", assay_type, "_", iv)) + p <- clusteringDendrogram( 2^assay_data[[assay_type]][variable_genes, ], observations[, iv, drop = FALSE], diff --git a/conf/modules.config b/conf/modules.config index bdd1ade9..625683e7 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -32,7 +32,7 @@ process { pattern: '*.anno.tsv' ] ] - ext.args = "--feature-type '${params.features_gtf_feature_type}' --first-field '${params.features_gtf_table_first_field}'" + ext.args = "--feature-type transcript" } withName: VALIDATOR { diff --git a/docs/output.md b/docs/output.md index f93e90be..4bf07ea8 100644 --- a/docs/output.md +++ b/docs/output.md @@ -68,18 +68,16 @@ The `differential` folder is likely to be the core result set for most users, co ## Shiny app -
-<summary>Output files</summary>
-
-- `shinyngs_app/`
-  - `[study name]`:
-    - `data.rds`: serialized R object which can be used to generate a Shiny application
-    - `app.R`: minimal R script that will source the data object and generate the app
-
-</details>
-

 The app must be run in an environment with [ShinyNGS](https://github.com/pinin4fjords/shinyngs) installed, or you can use the workflow parameters to deploy to shinyapps.io (see usage documentation).

+<details markdown="1">
+<summary>Output files</summary>
+
 ### Pipeline information

@@ -93,55 +91,3 @@ The app must be run in an environment with [ShinyNGS](https://github.com/pinin4f
[Nextflow](https://www.nextflow.io/docs/latest/tracing.html) provides excellent functionality for generating various reports relevant to the running and execution of the pipeline. This will allow you to troubleshoot errors with the running of the pipeline, and also provide you with other information such as launch commands, run times and resource usage. - -## Frequently asked questions - -### Why are no genes flagged as differentially expressed? - -#### 1. Low replication: - -**Problem:** The number of replicates in your RNA-seq experiment may be insufficient to detect statistically significant differential expression. - -**Suggested course of action:** Consider increasing the number of replicates to improve the statistical power of your analysis. Repeating the experiment with greater replication allows for better estimation of biological variation and increases the chances of observing significant differential expression. Consult with experimental design experts or statisticians to determine the appropriate sample size calculation based on your specific research question and resources. - -#### 2. Subtle effect: - -**Problem:** The experimental intervention may have a relatively subtle impact on gene expression, making it challenging to detect differential expression using default thresholds. - -**Suggested course of action:** Adjust the analysis parameters to improve sensitivity in capturing smaller changes in gene expression. Try reducing the `differential_min_fold_change` parameter to include genes with smaller fold changes. Additionally, consider increasing the `differential_max_qval` parameter to relax the significance threshold and capture a broader range of significant p-values or q-values. By fine-tuning these parameters, you increase the likelihood of identifying genes with subtle but biologically relevant changes in expression. - -#### 3. Genuinely no differential expression: - -**Problem:** It is possible that the experimental intervention has not significantly impacted gene expression, resulting in the absence of differentially expressed genes. - -**Suggested course of action:** Evaluate the experimental design and the perturbation itself. If the intervention is expected to induce changes in gene expression but no differential expression is observed, revisit the experimental design, biological perturbation, or underlying hypothesis. Consider reassessing the experimental conditions or exploring alternative approaches to investigate other aspects of the biological system. - -#### 4. Unaccounted sources of variance: - -**Problem:** Other factors outside the main treatment may introduce variance in gene expression, leading to a decrease in power to detect differential expression. - -**Suggested course of action:** Examine the PCA (Principal Component Analysis) and metadata association plots generated by the workflow. Identify variables associated with components that contribute significantly to the variance in your data. Include these variables as covariates in the contrasts table's blocking column to account for their effects on gene expression. By incorporating these unaccounted sources of variance into your analysis, you improve the accuracy and power to detect differential expression. - -#### 5. Biological complexity and pathway-level effects: - -**Problem:** The experimental intervention may not lead to observable differential expression at the individual gene level, but there may be coordinated changes at the pathway or functional level. 
- -**Suggested course of action:** Utilize pathway analysis tools such as Gene Set Enrichment Analysis (GSEA), available in this workflow. These tools evaluate the enrichment of gene sets or functional annotations to identify broader biological processes influenced by the experimental intervention. By focusing on pathway-level analysis, you can capture the overall impact of the intervention on biological processes, even if differential expression at the individual gene level is not apparent. - -#### 6. Limited options for normalization: - -**Problem:** The nf-core differential abundance workflow currently offers a limited set of normalization methods, which may not fully address the specific normalization requirements of your experiment. - -**Suggested course of action:** If the existing options do not adequately address your experiment's normalization challenges, consider developing custom normalization modules tailored to your needs. By contributing these modules to the nf-core community, you can expand the range of normalization options available to researchers. Your contributions will help researchers in similar situations and contribute to the continuous improvement and customization of the workflow. - -#### 7. Technical variability and batch effects: - -**Problem:** Technical variability and batch effects can introduce noise and confound the detection of differential expression. - -**Suggested course of action:** Address technical variability and batch effects in the experimental design and data analysis. Randomize sample collection, incorporate control samples, and balance samples across different experimental batches. These measures minimize technical variation, enhance the robustness of the analysis, and increase the chances of detecting true differential expression. - -#### 8. Workflow issues or bugs: - -**Problem:** Potential issues or bugs in the nf-core differential abundance workflow can affect the detection of differential expression or data analysis. - -**Suggested course of action:** Report any issues or suspected bugs by opening an issue on the [nf-core differential abundance workflow repository](https://github.com/nf-core/differentialabundance). Provide specific details, such as software versions, error messages, and relevant data or code snippets. Your feedback is valuable for improving the workflow's reliability. If you have the technical expertise, consider contributing to the workflow by submitting pull requests to address issues, fix bugs, or propose enhancements. 
diff --git a/docs/usage.md b/docs/usage.md index a0101cac..b97a7fee 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -181,7 +181,7 @@ The typical command for running the pipeline is as follows: ```bash nextflow run nf-core/differentialabundance \ - [--profile rnaseq OR -profile affy] \ + [-profile rnaseq OR -profile affy] \ --input samplesheet.csv \ --contrasts contrasts.csv \ [--matrix assay_matrix.tsv OR --affy_cel_files_archive cel_files.tar] \ diff --git a/modules/nf-core/limma/differential/templates/limma_de.R b/modules/nf-core/limma/differential/templates/limma_de.R index 47d0424f..cdcf14f3 100755 --- a/modules/nf-core/limma/differential/templates/limma_de.R +++ b/modules/nf-core/limma/differential/templates/limma_de.R @@ -147,7 +147,8 @@ library(limma) ## READ IN COUNTS FILE AND SAMPLE METADATA ## ################################################ ################################################ - +file.copy(opt\$count_file, "/home-link/iivow01/git/differentialabundance/error2/counts") +file.copy(opt\$sample_file, "/home-link/iivow01/git/differentialabundance/error2/sample_file") intensities.table <- read_delim_flexible( file = opt\$count_file, diff --git a/modules/nf-core/proteus/main.nf b/modules/nf-core/proteus/main.nf new file mode 100644 index 00000000..3d977ea5 --- /dev/null +++ b/modules/nf-core/proteus/main.nf @@ -0,0 +1,31 @@ +process PROTEUS { + tag "$meta" + label 'process_medium' +//TODO: Change containers +// conda "proteus2" +// conda "bioconda::r-proteus-bartongroup=0.2.16 conda-forge::r-plotly=4.10.1 bioconda::bioconductor-limma=3.54.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/r-proteus-bartongroup:0.2.16--r42hdfd78af_0' : + 'quay.io/biocontainers/mulled-v2-315db18c8d78a415a01c6264de61a7063523d1a0:e1c1e17f1fcd8a42a94770f3ebe242c6715270f8-0' }" + + input: + tuple val(meta), path(samplesheet), path(quants) + + output: + tuple val(meta), path("*normalised_distributions.png") , emit: nonnorm_dist_plot + tuple val(meta), path("*normalised_distributions.png") , emit: norm_dist_plot + tuple val(meta), path("*mean_variance_relationship.png") , emit: mean_var_relationship_plot + tuple val(meta), path("*dendrogram.png") , emit: dendro_plot + tuple val(meta), path("*raw_proteingroups.rds") , emit: rdata + tuple val(meta), path("*raw_proteingroups_tab.tsv") , emit: tab + tuple val(meta), path("*normalised_proteingroups_tab.tsv") , emit: normtab +// tuple val(meta), path("*normalised_proteingroups_tab2.tsv") , emit: normtab2 + tuple val(meta), path("*R_sessionInfo.log") , emit: session_info +// path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + template 'proteus_readproteingroups.R' +} diff --git a/modules/nf-core/proteus/meta.yml b/modules/nf-core/proteus/meta.yml new file mode 100644 index 00000000..99f0bb0d --- /dev/null +++ b/modules/nf-core/proteus/meta.yml @@ -0,0 +1,60 @@ +name: "limma_differential" +description: runs a differential expression analysis with Limma +keywords: + - differential + - expression + - microarray + - limma + +tools: + - "limma": + description: "Linear Models for Microarray Data" + homepage: "https://bioconductor.org/packages/release/bioc/html/limma.html" + documentation: "https://bioconductor.org/packages/release/bioc/vignettes/limma/inst/doc/usersguide.pdf" + tool_dev_url: https://github.com/cran/limma"" + doi: "10.18129/B9.bioc.limma" + licence: "['LGPL >=3']" + +input: + - meta: + 
type: map + description: | + Groovy Map containing contrast information, which can be referred to in + calls at the pipeline level e.g. [ variable:'treatment', reference:'treated', + control:'saline', blocking:'' ] passed in as ext.args like: '--reference_level + $meta.reference --treatment_level $meta.target' + - samplesheeet: + type: file + description: | + CSV or TSV format sample sheet with sample metadata + - intensities: + type: file + description: | + Raw TSV or CSV format expression matrix with probes by row and samples + by column + +output: + - results: + type: file + description: TSV-format table of differential expression information as + output by Limma + pattern: "*.limma.results.tsv" + - md_plot: + type: file + description: Limma mean difference plot + pattern: "*.mean_difference.png" + - rdata: + type: file + description: Serialised MArrayLM object + pattern: "*.MArrayLM.limma.rds" + - session_info: + type: file + description: dump of R SessionInfo + pattern: "*.log" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@pinin4fjords" diff --git a/modules/nf-core/proteus/templates/proteus_readproteingroups.R b/modules/nf-core/proteus/templates/proteus_readproteingroups.R new file mode 100755 index 00000000..ec9b0a68 --- /dev/null +++ b/modules/nf-core/proteus/templates/proteus_readproteingroups.R @@ -0,0 +1,542 @@ +#!/usr/bin/env Rscript + + + + +customreadEvidenceFile <- function(file, measure.cols=measureColumns, data.cols=evidenceColumns, zeroes.are.missing=TRUE) { + + columns <- c(data.cols, measure.cols) + #write(columns, file="/home/iivow01/git/differentialabundance/error/columns") + write(anyDuplicated(names(columns)), file="/home/iivow01/git/differentialabundance/error/wiebitte", append=F) + write(((columns)), file="/home/iivow01/git/differentialabundance/error/wiebitte", append=T) + if ("\n" %in% names(columns)) { + write("wat", file="/home/iivow01/git/differentialabundance/error/wat") + } + if(anyDuplicated(names(columns))){ + dupcols <- names(columns)[duplicated(names(columns))] + err <- paste("Column names must be unique. 
Got the following duplicate columns:", dupcols) + write(err, file="/home/iivow01/git/differentialabundance/error/err") + write( names(columns), file="/home/iivow01/git/differentialabundance/error/namescolumns") + #stop(err) + } + + # check if all required columns are in the evidence file + evi.cols <- read.delim(file, header=TRUE, sep="\t", check.names=FALSE, as.is=TRUE, strip.white=TRUE, nrows = 1) + missing <- NULL + for(col in columns) { + if(!(col %in% colnames(evi.cols))) missing <- c(missing, paste0("'", col, "'")) + } + if(!is.null(missing)) + stop(paste0("Column(s) ", paste0(missing, collapse=", "), " not found in file ", file)) + + # read and process evidence file + evi <- read.delim(file, header=TRUE, sep="\t", check.names=FALSE, as.is=TRUE, strip.white=TRUE) + evi <- evi[, as.character(columns)] + names(evi) <- names(columns) + # replace NaNs and infinites with NAs in measure columns + # the same with zeroes if flag is on + for(col in names(measure.cols)) { + x <- evi[, col] + x[is.nan(x) | is.infinite(x)] <- NA + if(zeroes.are.missing) x[x == 0] <- NA + evi[, col] <- x + } + + # remove rows that have only NAs in measure columns + not.empty <- which(rowSums(!is.na(evi[,names(measure.cols), drop=FALSE])) > 0) + evi <- evi[not.empty,] +} + + + + + + + + + + +################################################ +################################################ +## Functions ## +################################################ +################################################ + +#' Parse out options from a string without recourse to optparse +#' +#' @param x Long-form argument list like --opt1 val1 --opt2 val2 +#' +#' @return named list of options and values similar to optparse + +parse_args <- function(x){ + args_list <- unlist(strsplit(x, ' ?--')[[1]])[-1] + args_vals <- lapply(args_list, function(x) scan(text=x, what='character', quiet = TRUE)) + + # Ensure the option vectors are length 2 (key/ value) to catch empty ones + args_vals <- lapply(args_vals, function(z){ length(z) <- 2; z}) + + parsed_args <- structure(lapply(args_vals, function(x) x[2]), names = lapply(args_vals, function(x) x[1])) + parsed_args[! is.na(parsed_args)] +} + +#' Flexibly read CSV or TSV files +#' +#' @param file Input file +#' @param header Passed to read.delim() +#' @param row.names Passed to read.delim() +#' +#' @return output Data frame + +read_delim_flexible <- function(file, header = TRUE, row.names = NULL, check.names = F){ + + ext <- tolower(tail(strsplit(basename(file), split = "\\\\.")[[1]], 1)) + + if (ext == "tsv" || ext == "txt") { + separator <- "\\t" + } else if (ext == "csv") { + separator <- "," + } else { + stop(paste("Unknown separator for", ext)) + } + + read.delim( + file, + sep = separator, + header = header, + row.names = row.names, + check.names = check.names + ) +} + +#' Round numeric dataframe columns to fixed decimal places by applying +#' formatting and converting back to numerics +#' +#' @param dataframe A data frame +#' @param columns Which columns to round (assumes all of them by default) +#' @param digits How many decimal places to round to? 
+#' +#' @return output Data frame +# TODO check if this is necessary +round_dataframe_columns <- function(df, columns = NULL, digits = 8){ + if (is.null(columns)){ + columns <- colnames(df) + } + + df[,columns] <- format( + data.frame(df[, columns], check.names = FALSE), + nsmall = digits + ) + + # Convert columns back to numeric + + for (c in columns) { + df[[c]][grep("^ *NA\$", df[[c]])] <- NA + df[[c]] <- as.numeric(df[[c]]) + } + df +} + +################################################ +################################################ +## PARSE PARAMETERS FROM NEXTFLOW ## +################################################ +################################################ + +# I've defined these in a single array like this so that we could go back to an +# optparse-driven method in future with module bin/ directories, rather than +# the template + +# Set defaults and classes + +opt <- list( + quant_file = '$quants', + sample_file = '$samplesheet', + contrast_variable = NULL, + sample_id_col = 'sample', + measure_col_prefix = 'Intensity', + normfuns = 'normalizeMedian', + plotSampleDistributions_method = 'violin', + plotMV_loess = T, + palette_name = 'Set1' +) +opt_types <- lapply(opt, class) + +# Apply parameter overrides + +args_opt <- parse_args('$task.ext.args') +for ( ao in names(args_opt)){ + if (! ao %in% names(opt)){ + stop(paste("Invalid option:", ao)) + }else{ + + # Preserve classes from defaults where possible + if (! is.null(opt[[ao]])){ + args_opt[[ao]] <- as(args_opt[[ao]], opt_types[[ao]]) + } + opt[[ao]] <- args_opt[[ao]] + } +} + +# Check if required parameters have been provided + +required_opts <- c('quant_file', 'sample_file', 'contrast_variable') +missing <- required_opts[unlist(lapply(opt[required_opts], is.null)) | ! required_opts %in% names(opt)] + +if (length(missing) > 0){ + stop(paste("Missing required options:", paste(missing, collapse=', '))) +} + +# Check file inputs are valid + +for (file_input in c('quant_file', 'sample_file')){ + if (is.null(opt[[file_input]])) { + stop(paste("Please provide", file_input), call. = FALSE) + } + + if (! 
file.exists(opt[[file_input]])){ + stop(paste0('Value of ', file_input, ': ', opt[[file_input]], ' is not a valid file')) + } +} + +################################################ +################################################ +## Finish loading libraries ## +################################################ +################################################ +mytmp <- tempdir() + + +# if (!requireNamespace("BiocManager", quietly = TRUE)) { +# install.packages("BiocManager",repos = "http://cran.us.r-project.org", lib=mytmp) +# } +# library("BiocManager", lib.loc=mytmp) +# if (!require("rmarkdown")){ +# BiocManager::install("rmarkdown", lib=mytmp) +# library("rmarkdown", lib.loc=mytmp) +# } +# if (!require("miniUI")){ +# BiocManager::install("miniUI", lib=mytmp) +# library("miniUI", lib.loc=mytmp) +# } +# if (!require("pkgdown")){ +# BiocManager::install("pkgdown", lib=mytmp) +# library("pkgdown", lib.loc=mytmp) +# } +# if (!require("devtools")){ +# BiocManager::install("devtools", lib=mytmp) +# library("devtools", lib.loc=mytmp) +# } +# if (!require("limma")){ +# BiocManager::install("limma", lib=mytmp) +# library("limma", lib.loc=mytmp) +# } +# if (!require("ggplot2")){ +# BiocManager::install("ggplot2", lib=mytmp) +# library("ggplot2", lib.loc=mytmp) +# } +# if (!require("ggplot2")) { +# install.packages("ggplot2",repos = "http://cran.us.r-project.org", lib=mytmp) +# library("ggplot2", lib.loc=mytmp) +# } +# devtools::install_github("tidyverse/ggplot2", lib=mytmp) +# library("ggplot2", lib.loc=mytmp) + + +# if (!require("plotly")){ +# BiocManager::install("plotly", lib=mytmp) +# library("plotly", lib.loc=mytmp) +# } +# if (!require("proteus")){ +# devtools::install_github("bartongroup/Proteus", lib=mytmp, +# build_opts= c("--no-resave-data", "--no-manual"), build_vignettes=FALSE) +# library("proteus", lib.loc=mytmp) +# } + + +library(limma) +library(plotly) +library(proteus) +assignInNamespace("readEvidenceFile", customreadEvidenceFile, ns = "proteus") +################################################ +################################################ +## READ IN COUNTS FILE AND SAMPLE METADATA ## +################################################ +################################################ +#opt\$quant_file, +quant.table <- + read_delim_flexible( + file = opt\$quant_file, + check.names = FALSE, + row.names = 1 + ) +write.table(quant.table, file="/home-link/iivow01/git/differentialabundance/error/quanttablllllllllle.tsv", sep="\t") +sample.sheet <- + read_delim_flexible( + file = opt\$sample_file, + check.names=FALSE + ) + +# Deal with spaces that may be in sample column +#opt\$sample_id_col <- make.names(opt\$sample_id_col) + +if (! 
opt\$sample_id_col %in% colnames(sample.sheet)){ + stop(paste0("Specified sample ID column '", opt\$sample_id_col, "' is not in the sample sheet")) +} + +# Add metadata columns that are necessary for proteus + +sample.sheet\$sample <- sample.sheet[[opt\$sample_id_col]] + + + +#opt\$contrast_variable <- make.names(opt\$contrast_variable) +sample.sheet\$condition <- sample.sheet[[opt\$contrast_variable]] + +# Add prefix for proteinGroups measurement columns to the sample IDs from the sampesheet +measure.cols <- setNames(paste0(opt\$measure_col_prefix, sample.sheet[[opt\$sample_id_col]]), sample.sheet[[opt\$sample_id_col]]) +#measure.cols <- paste0(opt\$measure_col_prefix, sample.sheet[[opt\$sample_id_col]]) +#names(measure.cols) <- sample.sheet[[opt\$sample_id_col]] +write(measure.cols, file="/home/iivow01/git/differentialabundance/error/measurecols") +write(names(measure.cols), file="/home/iivow01/git/differentialabundance/error/measurecolsnames") +# TODO check if this can happen for proteingroups +# Sample sheet can have duplicate rows for multiple sequencing runs, so uniqify +# before assigning row names + +#sample.sheet <- sample.sheet[! duplicated(sample.sheet[[opt\$sample_id_col]]), ] +#rownames(sample.sheet) <- sample.sheet[[opt\$sample_id_col]] + +# Check that all samples specified in the input sheet are present in the quants +# table + +missing_columns <- paste0(opt\$measure_col_prefix, sample.sheet[[opt\$sample_id_col]]) +missing_columns <- missing_columns[!missing_columns %in% colnames(quant.table)] +#missing_samples <- +# (sample.sheet[[opt\$sample_id_col]])[!missing_columns %in% colnames(quant.table)] + +# TODO: Consider if this auto-filter should be kept or removed (probably removed, otherwise I also have to deal with makenames) +#sample.sheet <- sample.sheet[!(rownames(sample.sheet) %in% missing_samples),] + +write(missing_columns, file="/home-link/iivow01/git/differentialabundance/error2/samplecols") +#write(missing_samples, file="/home-link/iivow01/git/differentialabundance/error2/missing_samples") + +if (length(missing_columns) > 0) { + stop(paste( + length(missing_columns), + 'specified samples do not have a(n)', + opt\$measure_col_prefix, + 'column in quant table. The following columns are missing:', + paste(missing_columns, collapse = ', ') + )) +} else{ + # Save any non-quant data, with gene metadata etc we might need later + # TODO: Maybe just save the whole quant file? (or not; not sure the rest is ever needed) + nonquant.table <- + quant.table[, !colnames(quant.table) %in% paste0(opt\$measure_col_prefix, sample.sheet[[opt\$sample_id_col]]), drop = FALSE] +} + +################################################ +################################################ +## CHECK AND FORMAT NORMFUN AND FILTERFUN ## +################################################ +################################################ + +valid_normfuns <- c("normalizeMedian", "normalizeQuantiles") +normfuns <- opt\$normfuns + +# Check validity of normfun(s) +invalid_normfuns <- normfuns[!(normfuns %in% valid_normfuns)] +if (length(invalid_normfuns)>0) { + stop(paste0("Invalid normfuns argument(s): ", + paste(invalid_normfuns, collapse=", "), + ". 
Valid normfuns are: ", + paste(valid_normfuns, collapse=", "), + ".")) +} + +################################################ +################################################ +## Run Proteus processes and generate outputs ## +################################################ +################################################ + +# TODO +output_prefix <- "output_prefix" + +# TODO: Add link to https://rdrr.io/github/bartongroup/Proteus/man/readProteinGroups.html to docu and mention the necessary columns! +write.table(read.table(opt\$quant_file, sep="\t", header=T, check.names=F), file="/home/iivow01/git/differentialabundance/error/wtf.tsv", quote=F, sep="\t") + + +proteinColumns <- proteus::proteinColumns +capture.output((proteinColumns), file="/home/iivow01/git/differentialabundance/error/protcol") +#write.table(quant.table, file="/home-link/iivow01/git/differentialabundance/error/progro.tsv", quote=F) + +if ("Majority_protein_IDs" %in% colnames(quant.table)) { + #proteinColumns <- gsub(" ", ".", (proteinColumns)) + #proteinColumns <- make.names(proteinColumns) + #names(proteinColumns) <- names(proteus::proteinColumns) + proteinColumns <- setNames(gsub(" ", "_", proteinColumns), names(proteus::proteinColumns)) + + capture.output((proteinColumns), file="/home/iivow01/git/differentialabundance/error/protcol2") +} + +proteinGroups <- readProteinGroups( + file=opt\$quant_file, + meta=sample.sheet, + measure.cols=measure.cols, + data.cols=proteinColumns #c('Majority protein IDs', 'Potential contaminant', 'Reverse') +) + + +capture.output(proteinGroups, file="/home-link/iivow01/git/differentialabundance/error/proteingroups") +capture.output(str(proteinGroups), file="/home-link/iivow01/git/differentialabundance/error/proteingroupsstr") + +write("1", file="/home-link/iivow01/git/differentialabundance/error/status") + +write.table(proteinGroups\$tab, file="/home-link/iivow01/git/differentialabundance/error/tab", quote=F) +write("2", file="/home-link/iivow01/git/differentialabundance/error/status", append=T) + +# Generate plots for all requested normalizations; also, save +# normalized protein groups for limma +for (normfun in normfuns) { + proteinGroups.normalized <- normalizeData(proteinGroups, norm.fun = eval(parse(text=normfun))) # Proteus also accepts other norm.funs, e.g. 
from limma + proteinGroups.normalized\$tab <- na.omit(log2(proteinGroups.normalized\$tab)) + png(paste0('proteus.', normfun, '_normalised_distributions.png'), width = 5*300, height = 5*300, res = 300, pointsize = 8) + print(plotSampleDistributions(proteinGroups.normalized, title=paste("Sample distributions after applying", normfun), fill="condition", method=opt\$plotSampleDistributions_method) + scale_fill_brewer(palette=opt\$palette_name)) + dev.off() + write("2.3", file="/home-link/iivow01/git/differentialabundance/error/status", append=T) + + png(paste0('proteus.', normfun, '_normalised_mean_variance_relationship.png'), width = 5*300, height = 5*300, res = 300, pointsize = 8) + print(plotMV(proteinGroups.normalized, with.loess=opt\$plotMV_loess) + scale_fill_distiller(palette=opt\$palette_name)) #, title=paste("Sample mean variance relationship after applying", normfun) + dev.off() + write("2.6", file="/home-link/iivow01/git/differentialabundance/error/status", append=T) + + png(paste0('proteus.', normfun, '_normalised_dendrogram.png'), width = 5*300, height = 5*300, res = 300, pointsize = 8) + print(plotClustering(proteinGroups.normalized), title=paste("Sample clustering after applying", normfun)) + dev.off() + + + + summary <- summary(proteinGroups.normalized) + + # R object for other processes to use + saveRDS(proteinGroups.normalized, file = paste0('proteus.', normfun, 'normalised_proteingroups.rds')) +write("3", file="/home-link/iivow01/git/differentialabundance/error/status", append=T) + + + # Write normalized count matrix + write.table( + data.frame( + gene_id = rownames(proteinGroups.normalized\$tab), + proteinGroups.normalized\$tab, + check.names = FALSE + ), + file = paste(output_prefix, 'proteus', normfun, 'normalised_proteingroups_tab', 'tsv', sep = '.'), + col.names = TRUE, + row.names = FALSE, + sep = '\t', + quote = FALSE + ) + write.table( + data.frame( + gene_id = rownames(proteinGroups.normalized\$tab), + proteinGroups.normalized\$tab, + check.names = FALSE + ), + file = "/home-link/iivow01/git/differentialabundance/error/tabnorm.normaliseMedian.tsv", + col.names = TRUE, + row.names = FALSE, + sep = '\t', + quote = FALSE + ) + write.table( + data.frame( + gene_id = rownames(proteinGroups.normalized\$tab), + proteinGroups.normalized\$tab, + check.names = FALSE + ), + file = paste(output_prefix, 'proteus', normfun, 'normalised_proteingroups_tab2', 'tsv', sep = '.'), + col.names = TRUE, + row.names = FALSE, + sep = '\t', + quote = FALSE + ) + write.table( + data.frame( + gene_id = rownames(proteinGroups.normalized\$tab), + proteinGroups.normalized\$tab, + check.names = FALSE + ), + file = paste("/home-link/iivow01/git/differentialabundance/error/waaaaas.", normfun, ".tsv", sep = ''), + col.names = TRUE, + row.names = FALSE, + sep = '\t', + quote = FALSE + ) + + write("5", file="/home-link/iivow01/git/differentialabundance/error/status", append=T) + +} + + +# Remove NAs as these will otherwise mess with some of the other modules +# TODO should I also leave the log2 here (or log10)? 
If so, I think I have to apply it only after doing the norms as otherwise, +# every norm table will be logged twice (or I could log the raw table and NOT the norms, but that does not work) +proteinGroups\$tab <- na.omit(log2(proteinGroups\$tab)) + +# Generate raw distribution plot +png('proteus.raw_distributions.png', width = 5*300, height = 5*300, res = 300, pointsize = 8) +print(plotSampleDistributions(proteinGroups, title="Raw sample distributions", fill="condition", method=opt\$plotSampleDistributions_method) + scale_fill_brewer(palette=opt\$palette_name)) +dev.off() + +# R object for other processes to use +saveRDS(proteinGroups, file = 'proteus.raw_proteingroups.rds') + +# Write raw count matrix +write.table( + data.frame( + gene_id = rownames(proteinGroups\$tab), + proteinGroups\$tab, + check.names = FALSE + ), + file = paste(output_prefix, 'proteus', 'raw_proteingroups_tab', 'tsv', sep = '.'), + col.names = TRUE, + row.names = FALSE, + sep = '\t', + quote = FALSE +) + +################################################ +################################################ +## R SESSION INFO ## +################################################ +################################################ + +sink("R_sessionInfo.log") +print(sessionInfo()) +sink() + +################################################ +################################################ +## VERSIONS FILE ## +################################################ +################################################ +#TODO +r.version <- strsplit(version[['version.string']], ' ')[[1]][3] +limma.version <- as.character(packageVersion('limma')) +plotly.version <- as.character(packageVersion('plotly')) +proteus.version <- as.character(packageVersion('proteus')) +#TODO: change mparker2 +# writeLines( +# c( +# '"${task.process}":', +# paste(' r-base:', r.version), +# paste(' bioconductor-limma:', limma.version), +# paste(' r-plotly:', plotly.version), +# paste(' mparker2-proteus:', proteus.version), +# ), +# 'versions.yml') + +################################################ +################################################ +################################################ +################################################ diff --git a/modules/nf-core/rmarkdownnotebook/main.nf b/modules/nf-core/rmarkdownnotebook/main.nf index 45ed550b..ec8f21b1 100644 --- a/modules/nf-core/rmarkdownnotebook/main.nf +++ b/modules/nf-core/rmarkdownnotebook/main.nf @@ -7,10 +7,10 @@ process RMARKDOWNNOTEBOOK { //NB: You likely want to override this with a container containing all required //dependencies for your analysis. The container at least needs to contain the //yaml and rmarkdown R packages. - conda "conda-forge::r-base=4.1.0 conda-forge::r-rmarkdown=2.9 conda-forge::r-yaml=2.2.1" + conda "conda-forge::r-base=4.1.0 conda-forge::r-rmarkdown=2.9 conda-forge::r-yaml=2.2.1 anaconda::gmp=6.2.1 conda-forge::r-ggplot2=3.4.2" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/mulled-v2-31ad840d814d356e5f98030a4ee308a16db64ec5:0e852a1e4063fdcbe3f254ac2c7469747a60e361-0' : - 'biocontainers/mulled-v2-31ad840d814d356e5f98030a4ee308a16db64ec5:0e852a1e4063fdcbe3f254ac2c7469747a60e361-0' }" + 'docker.io/library/pxnotebook_local' }" input: tuple val(meta), path(notebook) diff --git a/modules/nf-core/shinyngs/app/main.nf b/modules/nf-core/shinyngs/app/main.nf index 7d601105..d7c03291 100644 --- a/modules/nf-core/shinyngs/app/main.nf +++ b/modules/nf-core/shinyngs/app/main.nf @@ -40,7 +40,7 @@ process SHINYNGS_APP { make_app_from_files.R \\ --sample_metadata $sample \\ --feature_metadata $feature_meta \\ - --assay_files ${assay_files.join(',')} \\ + --assay_files "/home/iivow01/git/differentialabundance/results_Px_noNA/proteus/output_prefix.proteus.raw_proteingroups_tab.tsv,/home/iivow01/git/differentialabundance/results_Px_noNA/proteus/output_prefix.proteus.normalizeMedian.normalised_proteingroups_tab.tsv" \\ --contrast_file $contrasts \\ --contrast_stats_assay $contrast_stats_assay \\ --differential_results ${differential_results.join(',')} \\ @@ -54,3 +54,11 @@ process SHINYNGS_APP { END_VERSIONS """ } + +// --assay_files ${assay_files.join(',')} \\ + +// --differential_results ${differential_results.join(',')} \\ + +// --assay_files "/home/iivow01/git/differentialabundance/results_Px_noNA/proteus/output_prefix.proteus.raw_proteingroups_tab.tsv,/home/iivow01/git/differentialabundance/results_Px_noNA/proteus/output_prefix.proteus.normalizeMedian.normalised_proteingroups_tab.tsv" \\ + +// --differential_results "/home/iivow01/git/differentialabundance/results_Px_noNA/tables/differential/Condition__genotype-WT-NFAT1_plus_2_minus_KO.limma.results.tsv,/home/iivow01/git/differentialabundance/results_Px_noNA/tables/differential/Condition__genotype-WT-NFAT1_minus_KO.limma.results.tsv,/home/iivow01/git/differentialabundance/results_Px_noNA/tables/differential/Condition__genotype-WT-NFAT2_minus_KO.limma.results.tsv" \\ diff --git a/nextflow.config b/nextflow.config index 731494e9..a62f37a8 100644 --- a/nextflow.config +++ b/nextflow.config @@ -40,10 +40,6 @@ params { features_name_col = 'gene_name' features_metadata_cols = 'gene_id,gene_name,gene_biotype' - // GTF parsing options - features_gtf_feature_type = 'transcript' - features_gtf_table_first_field = 'gene_id' - // Affy-specific options affy_cel_files_archive = null affy_file_name_col = 'file' diff --git a/nextflow_schema.json b/nextflow_schema.json index 80241d6a..1b7c0c39 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -24,7 +24,7 @@ "default": "rnaseq", "description": "A string identifying the technology used to produce the data", "help_text": "Currently 'rnaseq' or 'affy_array' may be specified.", - "enum": ["rnaseq", "affy_array"], + "enum": ["rnaseq", "affy_array", "Px"], "fa_icon": "far fa-keyboard" }, "input": { @@ -165,18 +165,6 @@ "description": "This parameter allows you to supply your own feature annotations. These can often be automatically derived from the GTF used upstream for RNA-seq, or from the Bioconductor annotation package (for affy arrays). ", "help_text": "This parameter allows you to supply your own feature annotations. These can often be automatically derived from the GTF used upstream for RNA-seq, or from the Bioconductor annotation package (for affy arrays). 
", "fa_icon": "fas fa-align-justify" - }, - "features_gtf_feature_type": { - "type": "string", - "default": "transcript", - "description": "Where a GTF file is supplied, which feature type to use", - "fa_icon": "fas fa-keyboard" - }, - "features_gtf_table_first_field": { - "type": "string", - "default": "gene_id", - "description": "Where a GTF file is supplied, which field should go first in the converted output table", - "fa_icon": "fas fa-fast-backward" } }, "required": ["features_id_col", "features_name_col", "features_type"], diff --git a/pxnotebook_env.yml b/pxnotebook_env.yml new file mode 100644 index 00000000..8049c896 --- /dev/null +++ b/pxnotebook_env.yml @@ -0,0 +1,21 @@ +# You can use this file to create a conda environment for this pipeline: +# conda env create -f environment.yml +# use this to find packages: https://anaconda.org/ +name: pxnotebook +channels: + - bioconda + - conda-forge + - anaconda +dependencies: + - anaconda::gmp=6.2.1 + - bioconda::r-shinyngs=1.7.2 + - conda-forge::r-base=4.2.3 + - conda-forge::r-dplyr=1.1.2 + - conda-forge::r-dt=0.28 + - conda-forge::r-knitr=1.43 + - conda-forge::r-plotly=4.10.1 + - conda-forge::r-rmarkdown=2.21 + - conda-forge::r-tidyverse=2.0.0  + - conda-forge::r-yaml=2.3.7 + - conda-forge::r-ggplot2=3.4.2 + - conda-forge::r-upsetr=1.4.0 \ No newline at end of file diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf index 198f2a71..2e50399c 100644 --- a/workflows/differentialabundance.nf +++ b/workflows/differentialabundance.nf @@ -22,9 +22,10 @@ if (params.study_type == 'affy_array'){ } else { error("CEL files archive not specified!") } -} else{ - - // If this is not an affy array, assume we're reading from a matrix +} else if (params.study_type == 'Px') { + proteus_in = Channel.of([ exp_meta, file(params.input), file(params.matrix) ]) +} else { + // If this is not an affy array or maxquant output, assume we're reading from a matrix if (params.matrix) { matrix_file = file(params.matrix, checkIfExists: true) @@ -91,6 +92,7 @@ include { CUSTOM_TABULARTOGSEACLS } from '../modules/n include { RMARKDOWNNOTEBOOK } from '../modules/nf-core/rmarkdownnotebook/main' include { AFFY_JUSTRMA as AFFY_JUSTRMA_RAW } from '../modules/nf-core/affy/justrma/main' include { AFFY_JUSTRMA as AFFY_JUSTRMA_NORM } from '../modules/nf-core/affy/justrma/main' +include { PROTEUS } from '../modules/nf-core/proteus/main' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -136,8 +138,13 @@ workflow DIFFERENTIALABUNDANCE { ch_in_norm = AFFY_JUSTRMA_NORM.out.expression ch_affy_platform_features = AFFY_JUSTRMA_RAW.out.annotation + } else if (params.study_type == 'Px') { + PROTEUS(proteus_in) + ch_in_raw = PROTEUS.out.tab + ch_in_norm = PROTEUS.out.normtab } + //// Fetch or derive a feature annotation table // If user has provided a feature annotation table, use that @@ -173,11 +180,23 @@ workflow DIFFERENTIALABUNDANCE { .mix(GTF_TO_TABLE.out.versions) } else{ - - // Otherwise we can just use the matrix input + if (params.study_type == 'Px'){ + ch_features = PROTEUS.out.normtab.map{ + matrix_as_anno_filename = "matrix_as_anno.${it[1].getExtension()}" + it[1].copyTo(matrix_as_anno_filename) + it[1] = file(matrix_as_anno_filename) + it + }.dump(tag:'waaaaa') + + //ch_features = PROTEUS.out.normtab2 //Channel.of([ exp_meta, file(matrix_as_anno_filename)]) + + } + else { + // Otherwise we can just use the matrix input matrix_as_anno_filename = "matrix_as_anno.${matrix_file.getExtension()}" 
matrix_file.copyTo(matrix_as_anno_filename) ch_features = Channel.of([ exp_meta, file(matrix_as_anno_filename)]) + } } // Channel for the contrasts file @@ -190,13 +209,19 @@ workflow DIFFERENTIALABUNDANCE { ch_matrices_for_validation = ch_in_raw .join(ch_in_norm) .map{tuple(it[0], [it[1], it[2]])} + } else if (params.study_type == 'Px') { + ch_matrices_for_validation = ch_in_raw + .join(ch_in_norm) + .dump(tag:'matval_px1') + .map{tuple(it[0], [it[1], it[2]])} + .dump(tag:'matval_px') } else{ ch_matrices_for_validation = ch_in_raw } VALIDATOR( - ch_input.join(ch_matrices_for_validation), + ch_input.join(ch_matrices_for_validation).dump(tag:'val_input'), ch_features, ch_contrasts_file ) @@ -204,7 +229,7 @@ workflow DIFFERENTIALABUNDANCE { // For Affy, we've validated multiple input matrices for raw and norm, // we'll separate them out again here - if (params.study_type == 'affy_array'){ + if (params.study_type == 'affy_array' || params.study_type == 'Px'){ ch_validated_assays = VALIDATOR.out.assays .transpose() .branch { @@ -214,6 +239,9 @@ workflow DIFFERENTIALABUNDANCE { ch_raw = ch_validated_assays.raw ch_norm = ch_validated_assays.normalised ch_matrix_for_differential = ch_norm + VALIDATOR.out.assays.dump(tag:'valassay') + ch_norm.dump(tag:'ch_norm') + } else{ ch_raw = VALIDATOR.out.assays ch_matrix_for_differential = ch_raw @@ -234,20 +262,28 @@ workflow DIFFERENTIALABUNDANCE { tuple(it, it.variable, it.reference, it.target) } - // Firstly Filter the input matrix + if (params.study_type == 'Px') { + ch_samples_and_matrix = VALIDATOR.out.sample_meta + .join(ch_matrix_for_differential) // -> meta, samplesheet, unfiltered matrix + .first() + } else { + // Firstly Filter the input matrix + ch_matrix_for_differential.dump(tag:'differentialmat') + VALIDATOR.out.sample_meta.dump(tag:'val_out_samplemeta') + CUSTOM_MATRIXFILTER( + ch_matrix_for_differential, + VALIDATOR.out.sample_meta + ) - CUSTOM_MATRIXFILTER( - ch_matrix_for_differential, - VALIDATOR.out.sample_meta - ) - // Prepare inputs for differential processes + // Prepare inputs for differential processes + ch_samples_and_matrix = VALIDATOR.out.sample_meta + .join(CUSTOM_MATRIXFILTER.out.filtered) // -> meta, samplesheet, filtered matrix + .first() + } - ch_samples_and_matrix = VALIDATOR.out.sample_meta - .join(CUSTOM_MATRIXFILTER.out.filtered) // -> meta, samplesheet, filtered matrix - .first() - if (params.study_type == 'affy_array'){ + if (params.study_type == 'affy_array' || params.study_type == 'Px'){ LIMMA_DIFFERENTIAL ( ch_contrasts, @@ -373,7 +409,7 @@ workflow DIFFERENTIALABUNDANCE { ch_contrast_variables .combine(ch_all_matrices.map{ it.tail() }) - +/* PLOT_EXPLORATORY( ch_contrast_variables .combine(ch_all_matrices.map{ it.tail() }) @@ -392,7 +428,7 @@ workflow DIFFERENTIALABUNDANCE { .mix(VALIDATOR.out.versions) .mix(PLOT_EXPLORATORY.out.versions) .mix(PLOT_DIFFERENTIAL.out.versions) - +*/ CUSTOM_DUMPSOFTWAREVERSIONS ( ch_versions.unique().collectFile(name: 'collated_versions.yml') ) @@ -441,8 +477,8 @@ workflow DIFFERENTIALABUNDANCE { .combine(ch_differential.map{it[1]}.collect().map{[it]}) SHINYNGS_APP( - ch_all_matrices, // meta, samples, features, [ matrices ] - ch_app_differential, // meta, contrasts, [differential results] + ch_all_matrices.dump(tag:'shiny1'), // meta, samples, features, [ matrices ] + ch_app_differential.dump(tag:'shiny2'), // meta, contrasts, [differential results] params.exploratory_assay_names.split(',').findIndexOf { it == params.exploratory_final_assay } + 1 ) ch_versions = 
ch_versions.mix(SHINYNGS_APP.out.versions) @@ -458,7 +494,7 @@ workflow DIFFERENTIALABUNDANCE { // Condition params reported on study type def params_pattern = ~/^(report|study|observations|features|filtering|exploratory|differential|deseq2|gsea).*/ - if (params.study_type == 'affy_array'){ + if (params.study_type == 'affy_array' || params.study_type == 'Px'){ params_pattern = ~/^(report|study|observations|features|filtering|exploratory|differential|affy|limma|gsea).*/ } From 5356b0c1df163dfa6cf7bfa84947597e4522ad19 Mon Sep 17 00:00:00 2001 From: WackerO Date: Mon, 26 Jun 2023 08:30:51 +0200 Subject: [PATCH 02/30] progress save for Px --- assets/differentialabundance_report.Rmd | 5 + .../limma/differential/templates/limma_de.R | 8 +- .../templates/proteus_readproteingroups.R | 199 ++++++------------ modules/nf-core/rmarkdownnotebook/main.nf | 1 + .../shinyngs/validatefomcomponents/main.nf | 1 + workflows/differentialabundance.nf | 6 +- 6 files changed, 74 insertions(+), 146 deletions(-) diff --git a/assets/differentialabundance_report.Rmd b/assets/differentialabundance_report.Rmd index 1b460a1a..8b98174c 100644 --- a/assets/differentialabundance_report.Rmd +++ b/assets/differentialabundance_report.Rmd @@ -207,6 +207,7 @@ if (! params$observations_name_col %in% colnames(observations)){ } if (! is.null(params$features)){ + write(params$features, file="/home/iivow01/git/differentialabundance/error2/paramfeatures") features <- read_metadata(file.path(params$input_dir, params$features)) features <- features[,colnames(features) %in% simpleSplit(params$features_metadata_cols), drop = FALSE] } @@ -322,6 +323,10 @@ differential_results <- lapply(differential_files, function(diff_file){ } # Annotate differential tables if possible + write(colnames(features), file="/home-link/iivow01/git/differentialabundance/error2/featurescols") + write(colnames(params$features_id_col), file="/home-link/iivow01/git/differentialabundance/error2/featuresbla") + write(colnames(params$differential_feature_id_column), file="/home-link/iivow01/git/differentialabundance/error2/featuresbla", append=T) + write.table((features), file="/home-link/iivow01/git/differentialabundance/error2/featurescols.tsv", sep="\t", quote=F) if (! 
is.null(params$features)){ diff <- merge(features, diff, by.x = params$features_id_col, by.y = params$differential_feature_id_column) diff --git a/modules/nf-core/limma/differential/templates/limma_de.R b/modules/nf-core/limma/differential/templates/limma_de.R index cdcf14f3..1e0ab5a8 100755 --- a/modules/nf-core/limma/differential/templates/limma_de.R +++ b/modules/nf-core/limma/differential/templates/limma_de.R @@ -147,8 +147,8 @@ library(limma) ## READ IN COUNTS FILE AND SAMPLE METADATA ## ################################################ ################################################ -file.copy(opt\$count_file, "/home-link/iivow01/git/differentialabundance/error2/counts") -file.copy(opt\$sample_file, "/home-link/iivow01/git/differentialabundance/error2/sample_file") +write("0", file="/home-link/iivow01/git/differentialabundance/error2/aisdliasdlaisd") + intensities.table <- read_delim_flexible( file = opt\$count_file, @@ -156,8 +156,10 @@ intensities.table <- row.names = opt\$probe_id_col, check.names = FALSE ) -sample.sheet <- read_delim_flexible(file = opt\$sample_file) +write("1", file="/home-link/iivow01/git/differentialabundance/error2/aisdliasdlaisd") +sample.sheet <- read_delim_flexible(file = opt\$sample_file) +write("2", file="/home-link/iivow01/git/differentialabundance/error2/aisdliasdlaisd") # Deal with spaces that may be in sample column opt\$sample_id_col <- make.names(opt\$sample_id_col) diff --git a/modules/nf-core/proteus/templates/proteus_readproteingroups.R b/modules/nf-core/proteus/templates/proteus_readproteingroups.R index ec9b0a68..b7bc7e6b 100755 --- a/modules/nf-core/proteus/templates/proteus_readproteingroups.R +++ b/modules/nf-core/proteus/templates/proteus_readproteingroups.R @@ -152,6 +152,7 @@ opt <- list( quant_file = '$quants', sample_file = '$samplesheet', contrast_variable = NULL, + protein_id_col = 'Majority protein IDs', sample_id_col = 'sample', measure_col_prefix = 'Intensity', normfuns = 'normalizeMedian', @@ -203,81 +204,32 @@ for (file_input in c('quant_file', 'sample_file')){ ## Finish loading libraries ## ################################################ ################################################ -mytmp <- tempdir() - - -# if (!requireNamespace("BiocManager", quietly = TRUE)) { -# install.packages("BiocManager",repos = "http://cran.us.r-project.org", lib=mytmp) -# } -# library("BiocManager", lib.loc=mytmp) -# if (!require("rmarkdown")){ -# BiocManager::install("rmarkdown", lib=mytmp) -# library("rmarkdown", lib.loc=mytmp) -# } -# if (!require("miniUI")){ -# BiocManager::install("miniUI", lib=mytmp) -# library("miniUI", lib.loc=mytmp) -# } -# if (!require("pkgdown")){ -# BiocManager::install("pkgdown", lib=mytmp) -# library("pkgdown", lib.loc=mytmp) -# } -# if (!require("devtools")){ -# BiocManager::install("devtools", lib=mytmp) -# library("devtools", lib.loc=mytmp) -# } -# if (!require("limma")){ -# BiocManager::install("limma", lib=mytmp) -# library("limma", lib.loc=mytmp) -# } -# if (!require("ggplot2")){ -# BiocManager::install("ggplot2", lib=mytmp) -# library("ggplot2", lib.loc=mytmp) -# } -# if (!require("ggplot2")) { -# install.packages("ggplot2",repos = "http://cran.us.r-project.org", lib=mytmp) -# library("ggplot2", lib.loc=mytmp) -# } -# devtools::install_github("tidyverse/ggplot2", lib=mytmp) -# library("ggplot2", lib.loc=mytmp) - - -# if (!require("plotly")){ -# BiocManager::install("plotly", lib=mytmp) -# library("plotly", lib.loc=mytmp) -# } -# if (!require("proteus")){ -# devtools::install_github("bartongroup/Proteus", 
lib=mytmp, -# build_opts= c("--no-resave-data", "--no-manual"), build_vignettes=FALSE) -# library("proteus", lib.loc=mytmp) -# } - library(limma) library(plotly) library(proteus) -assignInNamespace("readEvidenceFile", customreadEvidenceFile, ns = "proteus") + ################################################ ################################################ ## READ IN COUNTS FILE AND SAMPLE METADATA ## ################################################ ################################################ -#opt\$quant_file, + quant.table <- read_delim_flexible( file = opt\$quant_file, - check.names = FALSE, - row.names = 1 + check.names = FALSE ) -write.table(quant.table, file="/home-link/iivow01/git/differentialabundance/error/quanttablllllllllle.tsv", sep="\t") + sample.sheet <- read_delim_flexible( file = opt\$sample_file, check.names=FALSE ) -# Deal with spaces that may be in sample column -#opt\$sample_id_col <- make.names(opt\$sample_id_col) +if (! opt\$protein_id_col %in% colnames(quant.table)){ + stop(paste0("Specified protein ID column '", opt\$protein_id_col, "' is not in the quant table")) +} if (! opt\$sample_id_col %in% colnames(sample.sheet)){ stop(paste0("Specified sample ID column '", opt\$sample_id_col, "' is not in the sample sheet")) @@ -287,8 +239,6 @@ if (! opt\$sample_id_col %in% colnames(sample.sheet)){ sample.sheet\$sample <- sample.sheet[[opt\$sample_id_col]] - - #opt\$contrast_variable <- make.names(opt\$contrast_variable) sample.sheet\$condition <- sample.sheet[[opt\$contrast_variable]] @@ -327,7 +277,7 @@ if (length(missing_columns) > 0) { 'column in quant table. The following columns are missing:', paste(missing_columns, collapse = ', ') )) -} else{ +} else { # Save any non-quant data, with gene metadata etc we might need later # TODO: Maybe just save the whole quant file? (or not; not sure the rest is ever needed) nonquant.table <- @@ -359,34 +309,21 @@ if (length(invalid_normfuns)>0) { ################################################ ################################################ -# TODO -output_prefix <- "output_prefix" +output_prefix <- opt\$contrast_variable # TODO: Add link to https://rdrr.io/github/bartongroup/Proteus/man/readProteinGroups.html to docu and mention the necessary columns! 
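 # Note on inputs (an assumption based on the Proteus documentation linked in the TODO above):
 # readProteinGroups() reads a MaxQuant proteinGroups.txt-style table and needs the protein ID
 # column (opt\$protein_id_col, default 'Majority protein IDs') and, with the default data.cols,
 # the 'Reverse' and 'Potential contaminant' columns used to flag decoys and contaminants, plus
 # one intensity column per sample matching the names built in measure.cols.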
write.table(read.table(opt\$quant_file, sep="\t", header=T, check.names=F), file="/home/iivow01/git/differentialabundance/error/wtf.tsv", quote=F, sep="\t") - -proteinColumns <- proteus::proteinColumns -capture.output((proteinColumns), file="/home/iivow01/git/differentialabundance/error/protcol") -#write.table(quant.table, file="/home-link/iivow01/git/differentialabundance/error/progro.tsv", quote=F) - -if ("Majority_protein_IDs" %in% colnames(quant.table)) { - #proteinColumns <- gsub(" ", ".", (proteinColumns)) - #proteinColumns <- make.names(proteinColumns) - #names(proteinColumns) <- names(proteus::proteinColumns) - proteinColumns <- setNames(gsub(" ", "_", proteinColumns), names(proteus::proteinColumns)) - - capture.output((proteinColumns), file="/home/iivow01/git/differentialabundance/error/protcol2") -} +# Replace proteus default ID column with user param and re-set the names of the resulting object (gsub sets the names to NULL) +proteinColumns <- setNames(gsub("Majority protein IDs", opt\$protein_id_col, proteus::proteinColumns), names(proteus::proteinColumns)) proteinGroups <- readProteinGroups( file=opt\$quant_file, meta=sample.sheet, measure.cols=measure.cols, - data.cols=proteinColumns #c('Majority protein IDs', 'Potential contaminant', 'Reverse') + data.cols=proteinColumns ) - capture.output(proteinGroups, file="/home-link/iivow01/git/differentialabundance/error/proteingroups") capture.output(str(proteinGroups), file="/home-link/iivow01/git/differentialabundance/error/proteingroupsstr") @@ -400,18 +337,30 @@ write("2", file="/home-link/iivow01/git/differentialabundance/error/status", app for (normfun in normfuns) { proteinGroups.normalized <- normalizeData(proteinGroups, norm.fun = eval(parse(text=normfun))) # Proteus also accepts other norm.funs, e.g. 
from limma proteinGroups.normalized\$tab <- na.omit(log2(proteinGroups.normalized\$tab)) - png(paste0('proteus.', normfun, '_normalised_distributions.png'), width = 5*300, height = 5*300, res = 300, pointsize = 8) - print(plotSampleDistributions(proteinGroups.normalized, title=paste("Sample distributions after applying", normfun), fill="condition", method=opt\$plotSampleDistributions_method) + scale_fill_brewer(palette=opt\$palette_name)) + + png(paste0(output_prefix, '.proteus.', normfun, '_normalised_distributions.png'), width = 5*300, height = 5*300, res = 300, pointsize = 8) + print( + plotSampleDistributions(proteinGroups.normalized, title=paste0("Sample distributions after applying\n", normfun), fill="condition", method=opt\$plotSampleDistributions_method) + + scale_fill_brewer(palette=opt\$palette_name, name=opt\$contrast_variable) + + theme(plot.title = element_text(size = 12)) + ) dev.off() - write("2.3", file="/home-link/iivow01/git/differentialabundance/error/status", append=T) - - png(paste0('proteus.', normfun, '_normalised_mean_variance_relationship.png'), width = 5*300, height = 5*300, res = 300, pointsize = 8) - print(plotMV(proteinGroups.normalized, with.loess=opt\$plotMV_loess) + scale_fill_distiller(palette=opt\$palette_name)) #, title=paste("Sample mean variance relationship after applying", normfun) + + png(paste0(output_prefix, '.proteus.', normfun, '_normalised_mean_variance_relationship.png'), width = 5*300, height = 5*300, res = 300, pointsize = 8) + print( + plotMV(proteinGroups.normalized, with.loess=opt\$plotMV_loess) + + ggtitle(paste0("Sample mean variance relationship after applying\n", normfun)) + + scale_fill_distiller(palette=opt\$palette_name) + + theme(plot.title = element_text(size = 12)) + ) dev.off() - write("2.6", file="/home-link/iivow01/git/differentialabundance/error/status", append=T) - png(paste0('proteus.', normfun, '_normalised_dendrogram.png'), width = 5*300, height = 5*300, res = 300, pointsize = 8) - print(plotClustering(proteinGroups.normalized), title=paste("Sample clustering after applying", normfun)) + png(paste0(output_prefix, '.proteus.', normfun, '_normalised_dendrogram.png'), width = 5*300, height = 5*300, res = 300, pointsize = 8) + print( + plotClustering(proteinGroups.normalized) + + ggtitle(paste0("Sample clustering after applying\n", normfun)) + + theme(plot.title = element_text(size = 12)) + ) dev.off() @@ -419,62 +368,24 @@ for (normfun in normfuns) { summary <- summary(proteinGroups.normalized) # R object for other processes to use - saveRDS(proteinGroups.normalized, file = paste0('proteus.', normfun, 'normalised_proteingroups.rds')) -write("3", file="/home-link/iivow01/git/differentialabundance/error/status", append=T) - + saveRDS(proteinGroups.normalized, file = paste0(output_prefix, '.proteus.', normfun, 'normalised_proteingroups.rds')) # Write normalized count matrix - write.table( - data.frame( - gene_id = rownames(proteinGroups.normalized\$tab), - proteinGroups.normalized\$tab, - check.names = FALSE - ), - file = paste(output_prefix, 'proteus', normfun, 'normalised_proteingroups_tab', 'tsv', sep = '.'), - col.names = TRUE, - row.names = FALSE, - sep = '\t', - quote = FALSE - ) - write.table( - data.frame( - gene_id = rownames(proteinGroups.normalized\$tab), - proteinGroups.normalized\$tab, - check.names = FALSE - ), - file = "/home-link/iivow01/git/differentialabundance/error/tabnorm.normaliseMedian.tsv", - col.names = TRUE, - row.names = FALSE, - sep = '\t', - quote = FALSE - ) - write.table( - data.frame( - gene_id = 
rownames(proteinGroups.normalized\$tab), - proteinGroups.normalized\$tab, - check.names = FALSE - ), - file = paste(output_prefix, 'proteus', normfun, 'normalised_proteingroups_tab2', 'tsv', sep = '.'), - col.names = TRUE, - row.names = FALSE, - sep = '\t', - quote = FALSE + out_df <- data.frame( + proteinGroups.normalized\$tab, + check.names = FALSE ) + out_df[[opt\$protein_id_col]] = rownames(proteinGroups.normalized\$tab) + out_df <- out_df[c(opt\$protein_id_col, colnames(out_df)[colnames(out_df) != opt\$protein_id_col])] + write.table( - data.frame( - gene_id = rownames(proteinGroups.normalized\$tab), - proteinGroups.normalized\$tab, - check.names = FALSE - ), - file = paste("/home-link/iivow01/git/differentialabundance/error/waaaaas.", normfun, ".tsv", sep = ''), + out_df, + file = paste(output_prefix, 'proteus', normfun, 'normalised_proteingroups_tab', 'tsv', sep = '.'), col.names = TRUE, row.names = FALSE, sep = '\t', quote = FALSE ) - - write("5", file="/home-link/iivow01/git/differentialabundance/error/status", append=T) - } @@ -484,20 +395,28 @@ write("3", file="/home-link/iivow01/git/differentialabundance/error/status", app proteinGroups\$tab <- na.omit(log2(proteinGroups\$tab)) # Generate raw distribution plot -png('proteus.raw_distributions.png', width = 5*300, height = 5*300, res = 300, pointsize = 8) -print(plotSampleDistributions(proteinGroups, title="Raw sample distributions", fill="condition", method=opt\$plotSampleDistributions_method) + scale_fill_brewer(palette=opt\$palette_name)) +png(paste0(output_prefix, '.proteus.raw_distributions.png'), width = 5*300, height = 5*300, res = 300, pointsize = 8) +print( + plotSampleDistributions(proteinGroups, title="Raw sample distributions", fill="condition", method=opt\$plotSampleDistributions_method) + + scale_fill_brewer(palette=opt\$palette_name, name=opt\$contrast_variable) + + theme(plot.title = element_text(size = 12)) + ) dev.off() # R object for other processes to use -saveRDS(proteinGroups, file = 'proteus.raw_proteingroups.rds') +saveRDS(proteinGroups, file = paste0(output_prefix, '.proteus.raw_proteingroups.rds')) # Write raw count matrix -write.table( - data.frame( - gene_id = rownames(proteinGroups\$tab), +out_df <- data.frame( proteinGroups\$tab, check.names = FALSE - ), + ) +out_df[[opt\$protein_id_col]] = rownames(proteinGroups\$tab) +out_df <- out_df[c(opt\$protein_id_col, colnames(out_df)[colnames(out_df) != opt\$protein_id_col])] + + +write.table( + out_df, file = paste(output_prefix, 'proteus', 'raw_proteingroups_tab', 'tsv', sep = '.'), col.names = TRUE, row.names = FALSE, @@ -539,4 +458,4 @@ proteus.version <- as.character(packageVersion('proteus')) ################################################ ################################################ ################################################ -################################################ +################################################ \ No newline at end of file diff --git a/modules/nf-core/rmarkdownnotebook/main.nf b/modules/nf-core/rmarkdownnotebook/main.nf index ec8f21b1..4bcdcc35 100644 --- a/modules/nf-core/rmarkdownnotebook/main.nf +++ b/modules/nf-core/rmarkdownnotebook/main.nf @@ -60,6 +60,7 @@ process RMARKDOWNNOTEBOOK { } """ + echo $parameters > /home/iivow01/git/differentialabundance/error2/parameters # Dump .params.yml heredoc (section will be empty if parametrization is disabled) ${indent_code_block(params_cmd, 4)} diff --git a/modules/nf-core/shinyngs/validatefomcomponents/main.nf b/modules/nf-core/shinyngs/validatefomcomponents/main.nf 
index 4a80e042..97ec6448 100644 --- a/modules/nf-core/shinyngs/validatefomcomponents/main.nf +++ b/modules/nf-core/shinyngs/validatefomcomponents/main.nf @@ -30,6 +30,7 @@ process SHINYNGS_VALIDATEFOMCOMPONENTS { def feature = feature_meta ? "--feature_metadata '$feature_meta'" : '' """ + echo $args > "/home-link/iivow01/git/differentialabundance/error2/val" validate_fom_components.R \\ --sample_metadata "$sample" \\ $feature \\ diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf index 2e50399c..7c86dbdc 100644 --- a/workflows/differentialabundance.nf +++ b/workflows/differentialabundance.nf @@ -148,7 +148,6 @@ workflow DIFFERENTIALABUNDANCE { //// Fetch or derive a feature annotation table // If user has provided a feature annotation table, use that - if (params.features){ ch_features = Channel.of([ exp_meta, file(params.features, checkIfExists: true)]) } else if (params.study_type == 'affy_array'){ @@ -219,7 +218,8 @@ workflow DIFFERENTIALABUNDANCE { else{ ch_matrices_for_validation = ch_in_raw } - + print("häää") + print(params.observations_id_col) VALIDATOR( ch_input.join(ch_matrices_for_validation).dump(tag:'val_input'), ch_features, @@ -505,7 +505,7 @@ workflow DIFFERENTIALABUNDANCE { } // Render the final report - + ch_report_params.dump(tag:'ch_report_params') RMARKDOWNNOTEBOOK( ch_report_file, ch_report_params, From ca8747297335f5d1902c3a8074359b12786fa4ef Mon Sep 17 00:00:00 2001 From: WackerO Date: Tue, 27 Jun 2023 09:38:28 +0200 Subject: [PATCH 03/30] progress save --- assets/Px_report.Rmd | 979 ++++++++++++++++++ modules/nf-core/proteus/main.nf | 2 +- .../templates/proteus_readproteingroups.R | 145 +-- 3 files changed, 1010 insertions(+), 116 deletions(-) create mode 100644 assets/Px_report.Rmd diff --git a/assets/Px_report.Rmd b/assets/Px_report.Rmd new file mode 100644 index 00000000..2b35a0d7 --- /dev/null +++ b/assets/Px_report.Rmd @@ -0,0 +1,979 @@ +--- +output: + html_document: + toc: true # table of contents + toc_float: true # float the table of contents to the left of the main document content + toc_depth: 4 # header levels 1,2,3 + theme: default + number_sections: false # add section numbering to headers + df_print: paged # tables are printed as an html table with support for pagination over rows and columns + highlight: pygments + pdf_document: true + pdf_document: + toc: yes +date: "`r Sys.Date()`" +params: + meta: NULL + input_dir: NULL + artifact_dir: NULL + cpus: 1 + study_type: NULL + study_name: NULL + study_abundance_type: NULL + report_file: NULL, + report_title: NULL, + report_author: NULL, + report_description: NULL, + observations_type: NULL + observations: NULL # GSE156533.samplesheet.csv + observations_id_col: NULL + observations_name_col: NULL + check_log: NULL + features: NULL + features_type: NULL + features_id_col: NULL + features_name_col: NULL + features_metadata_cols: NULL + raw_matrix: null # e.g. 0_salmon.merged.gene_counts.tsv + normalised_matrix: null + variance_stabilised_matrix: null # e.g. test_files/3_treatment-WT-P23H.vst.tsv + contrasts_file: null # e.g. 
GSE156533.contrasts.csv + differential_table: file.csv + affy_cel_files_archive: NULL + affy_file_name_col: NULL + affy_background: NULL + affy_bgversion: NULL + affy_destructive: NULL + affy_cdfname: NULL + affy_rm_mask: NULL + affy_rm_outliers: NULL + affy_rm_extra: NULL + affy_build_annotation: NULL + limma_ndups: NULL + limma_spacing: NULL + limma_block: NULL + limma_correlation: NULL + limma_method: NULL + limma_proportion: NULL + limma_stdev_coef_lim: NULL + limma_trend: NULL + limma_robust: NULL + limma_winsor_tail_p: NULL + limma_adjust_method: NULL + limma_p_value: NULL + limma_lfc: NULL + limma_confint: NULL + exploratory_n_features: null + exploratory_clustering_method: null + exploratory_cor_method: null + exploratory_whisker_distance: null + exploratory_mad_threshold: null + exploratory_main_variable: null + exploratory_assay_names: NULL + exploratory_final_assay: NULL + exploratory_palette_name: NULL + versions_file: null # e.g 17_software_versions.yml + logo: null + css: null + citations: null + filtering_min_samples: 1 + filtering_min_abundance: 1 + filtering_min_proportion: NULL + filtering_grouping_var: NULL + differential_file_suffix: NULL + differential_feature_id_column: NULL + differential_feature_name_column: NULL + differential_fc_column: NULL + differential_pval_column: NULL + differential_qval_column: NULL + differential_min_fold_change: NULL + differential_foldchanges_logged: NULL + differential_max_pval: NULL + differential_max_qval: NULL + differential_palette_name: NULL + differential_subset_to_contrast_samples: NULL + deseq2_test: NULL + deseq2_fit_type: NULL + deseq2_sf_type: NULL + deseq2_min_replicates_for_replace: NULL + deseq2_use_t: NULL + deseq2_lfc_threshold: NULL + deseq2_alt_hypothesis: NULL + deseq2_independent_filtering: NULL + deseq2_p_adjust_method: NULL + deseq2_alpha: NULL + deseq2_minmu: NULL + deseq2_vs_method: NULL + deseq2_shrink_lfc: NULL + deseq2_cores: NULL + deseq2_vs_blind: NULL + deseq2_vst_nsub: NULL + gsea_run: false + gsea_nperm: NULL + gsea_permute: NULL + gsea_scoring_scheme: NULL + gsea_metric: NULL + gsea_sort: NULL + gsea_order: NULL + gsea_set_max: NULL + gsea_set_min: NULL + gsea_norm: NULL + gsea_rnd_type: NULL + gsea_make_sets: NULL + gsea_median: NULL + gsea_num: NULL + gsea_plot_top_x: NULL + gsea_rnd_seed: NULL + gsea_save_rnd_lists: NULL + gsea_zip_report: NULL + gsea_chip_file: NULL + gsea_gene_sets: NULL +--- + + + +```{r, include=FALSE} +library(knitr) +library(yaml) +library(shinyngs) +library(plotly) +library(ggplot2) +library(DT) +library(dplyr) + +# TODO +#mulled-build --use-mamba build 'r-base=4.2.3,r-rmarkdown=2.21,r-yaml=2.3.7,bioconductor-enhancedvolcano=1.16.0,anaconda::gmp=6.2.1,conda-forge::r-ggplot2=3.4.2,conda-forge::r-upsetr' +#mulled-build --use-mamba build 'r-base,r-rmarkdown,r-yaml,bioconductor-enhancedvolcano,anaconda::gmp,conda-forge::r-ggplot2,conda-forge::r-upsetr' +#conda-forge::r-base conda-forge::r-rmarkdown conda-forge::r-yaml bioconda::bioconductor-enhancedvolcano anaconda::gmp conda-forge::r-ggplot2 #conda-forge::r-upsetr +#/home-link/iivow01/tools/mambaforge/bin/python3.10 +#/home-link/iivow01/tools/mambaforge/envs/mulled/lib/python3.11/site-packages/galaxy/tool_util/deps/mulled +#galaxy.tool_util.deps.mulled.mulled_build +``` + +```{r include = FALSE} +# Load the datatables js +datatable(NULL) +``` + +```{r, include=FALSE} +versions <- unlist(yaml.load_file(file.path(params$input_dir, params$versions_file)), recursive = FALSE) +params_table <- data.frame(Parameter = 
names(unlist(params)), Value = unlist(params), row.names = NULL) + +# We'll subset the params table for different report sections +make_params_table <- function(name, pattern = NULL, remove_pattern = FALSE){ + subparams <- params_table + if (! is.null(pattern)){ + subparams <- subparams[grep(pattern, subparams$Parameter),] + } + if (remove_pattern){ + subparams$Parameter <- sub(pattern, '', subparams$Parameter) + } + + if (nrow(subparams) > 10){ + dom <- 'tp' + }else{ + dom <- 't' + } + + print( htmltools::tagList(datatable(subparams, caption = paste("Parameters used for", name), rownames = FALSE, options = list(dom = dom)) )) +} + +report_title <- paste0('Differential ', params$features_type, ' abundance report', ifelse(is.null(params$report_title), '', paste0(': ', params$report_title))) +report_subtitle <- paste0(ifelse(is.null(params$report_author), '', paste0('By ', params$report_author, ', ')), 'differentialabundance workflow version', versions[["Workflow.nf-core/differentialabundance"]]) +``` + +--- +title: "`r report_title`" +subtitle: `r report_subtitle` +--- + + + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +``` + + + +```{r, echo=FALSE} +htmltools::includeCSS(params$css) +``` + +```{r results="asis", echo=FALSE} +cat(paste0(" + +")) +``` + + + +```{r, echo=FALSE} +observations <- read_metadata(file.path(params$input_dir, params$observations), id_col = params$observations_id_col) +if (! params$observations_name_col %in% colnames(observations)){ + stop(paste('Invalid observation name column specified: ', params$observations_name_col, paste0('(Valid values are: ', paste(colnames(observations), collapse=', '),')'))) +} + +if (! is.null(params$features)){ + features <- read_metadata(file.path(params$input_dir, params$features)) + features <- features[,colnames(features) %in% simpleSplit(params$features_metadata_cols), drop = FALSE] +} + +contrasts <- read_metadata(file.path(params$input_dir, params$contrasts_file)) +contrasts$blocking <- na.replace(contrasts$blocking, '') +if (! 'id' %in% colnames(contrasts)){ + contrasts$id <- apply(contrasts, 1, paste, collapse='_') +} + +# Identify informative variables- those with a number of values greater than 1 +# but less than N, with N being the number of observations. Make sure contrast +# variables are first in the list + +informative_variables <- unique(c(contrasts$variable, chooseGroupingVariables(observations))) + +# Remove any informative variables that group observations the same way +informative_variables <- informative_variables[ ! 
duplicated(lapply(structure(informative_variables, names= informative_variables), function(x) as.numeric(factor(observations[[x]], levels=unique(observations[[x]])))))] + +assay_names <- simpleSplit(params$exploratory_assay_names) +names(assay_names) = assay_names +assay_files <- lapply(assay_names, function(x) params[[paste0(x, '_matrix')]]) +capture.output(assay_files, file="/home-link/iivow01/git/differentialabundance/error/assay_files") + +assay_data <- lapply(assay_files, function(x) { + mat <- read_matrix( + x, + sample_metadata = observations, + row.names = 1 + ) + colnames(mat) <- observations[[params$observations_name_col]][match(colnames(mat), rownames(observations))] + + # Bit hacky, but ensure log + if (params$check_log && max(mat, na.rm=T) > 20){ + log2(mat+1) + }else{ + mat + } +}) + +# Now we can rename the observations rows using the title field +rownames(observations) <- observations[[params$observations_name_col]] + +# Run PCA early so we can understand how important each variable is + +pca_datas <- lapply(names(assay_data), function(assay_type){ + compilePCAData(assay_data[[assay_type]]) +}) +names(pca_datas) <- names(assay_data) + +pca_vs_meta <- anova_pca_metadata(pca_datas[[params$exploratory_final_assay]]$coords, observations[,informative_variables, drop = FALSE], pca_datas[[params$exploratory_final_assay]]$percentVar) + +# Show the variable with the tightest PC associations first +informative_variables <- rownames(pca_vs_meta)[order(pca_vs_meta[,1])] + +# Pick the variable used for coloring purposes etc +if (params$exploratory_main_variable == 'contrasts'){ + main_grouping_variable <- contrasts$variable[1] +}else if (params$exploratory_main_variable == 'auto_pca'){ + main_grouping_variable <- informative_variables[1] +}else{ + if (! params$exploratory_main_variable %in% colnames(observations)){ + stop(paste('Invalid main variable specified: ', params$exploratory_main_variable)) + } + main_grouping_variable <- params$exploratory_main_variable +} + +# Make sure the main variable is shown first, with remaining shown in order of +# informativeness + +informative_variables <- unique(c(main_grouping_variable, informative_variables)) + +groupColorScale <- makeColorScale(length(unique(observations[[main_grouping_variable]])), palette = params$exploratory_palette_name) +``` + + + +```{r, echo=FALSE} + +prefix_part_names <- c('variable', 'reference', 'target', 'blocking') +diff_prefixes <- sub('-$', '', apply(contrasts[,prefix_part_names], 1, function(x) paste(x, collapse = '-'))) + +differential_files <- lapply(diff_prefixes, function(d){ + file.path(params$input_dir, paste0(gsub(' |;', '_', d), params$differential_file_suffix)) +}) + +differential_results <- lapply(differential_files, function(diff_file){ + if (! file.exists(diff_file)){ + stop(paste("Differential file", diff_file, "does not exist")) + } + diff <- read_differential( + diff_file, + feature_id_column = params$differential_feature_id_column, + fc_column = params$differential_fc_column, + pval_column = params$differential_pval_column, + qval_column = params$differential_qval_column + ) + + # If fold changes are not logged already, log them (we assume they're logged + # later on) + + if (! params$differential_foldchanges_logged){ + diff[[params$differential_fc_column]] <- log2(diff[[params$differential_fc_column]]) + } + + # Annotate differential tables if possible + + if (! 
is.null(params$features)){ + diff <- merge(features, diff, by.x = params$features_id_col, by.y = params$differential_feature_id_column) + } + diff +}) +names(differential_results) <- diff_prefixes +``` + + + +```{r, echo=FALSE} + +contrast_descriptions <- paste(contrasts$target, 'versus', contrasts$reference, 'in', contrasts$variable) +with_blocking <- which(contrasts$blocking != '') +contrast_descriptions[with_blocking] <- paste0(contrast_descriptions[with_blocking], " (blocking on ", contrasts$blocking[with_blocking],")") + +# Check both adjusted and unadjusted p values + +p_value_types <- list(Adjusted = params$differential_qval_column, Unadjusted = params$differential_pval_column) +p_value_thresholds <- list(Adjusted = params$differential_max_qval, Unadjusted = params$differential_max_pval) + +sig_differential <- + lapply(names(p_value_types), function(pvt){ + diff <- lapply( + 1:nrow(contrasts), + function(x){ + signif <- differential_results[[x]][,p_value_types[[pvt]] ] < p_value_thresholds[[pvt]] + list( + up = differential_results[[x]][which( + differential_results[[x]][,params$differential_fc_column ] > log2(params$differential_min_fold_change) & + signif + ),], + down = differential_results[[x]][which( + differential_results[[x]][,params$differential_fc_column ] < log2(1/params$differential_min_fold_change) & + signif + ),] + ) + } + ) + names(diff) <- contrast_descriptions + diff + }) +names(sig_differential) <- names(p_value_types) + +# Count the differential genes +differential_tables <- lapply(names(sig_differential), function(sd) do.call(rbind, lapply(sig_differential[[sd]], function(x) lapply(x, function(y) nrow(y))))) +names(differential_tables) <- names(sig_differential) +``` + + + +# Abstract + +This report summarises differential `r params$features_type` analysis as performed by the nf-core/differentialabundance pipeline. 
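For orientation, the per-contrast up/down counts shown later come from a simple threshold rule on log2 fold change and p-values. The snippet below is a minimal, self-contained sketch of that rule using toy data and hypothetical column names; in the report the thresholds come from `params$differential_min_fold_change`, `params$differential_max_qval` and `params$differential_max_pval`.

```r
# Minimal sketch (toy data, hypothetical column names) of the significance rule
# used to count up-/down-regulated features in this report.
de <- data.frame(log2fc = c(2.1, -1.7, 0.3, -0.1),
                 padj   = c(0.001, 0.040, 0.200, 0.800))

min_fold_change <- 2     # stand-in for params$differential_min_fold_change
max_qval        <- 0.05  # stand-in for params$differential_max_qval

up   <- de[de$log2fc > log2(min_fold_change)     & de$padj < max_qval, ]
down <- de[de$log2fc < log2(1 / min_fold_change) & de$padj < max_qval, ]
c(up = nrow(up), down = nrow(down))  # 1 up, 1 down for this toy table
```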
+ +# Data + +```{r, echo=FALSE, results='asis'} +cat(paste0("\n## ", ucfirst(params$observations_type), "s\n")) +``` + + +A summary of `r params$observations_type` metadata is below: + +```{r, echo=FALSE, results='asis'} +display_columns <- union(c(params$observations_id_col, unique(contrasts$variable)), informative_variables) +minimal_fetchngs_cols <- c('sample', 'sample_title', 'strandedness', 'library_strategy', 'scientific_name') + +# If the data came via fetchngs then we can infer a couple of things about the most useful columns + +if (all(minimal_fetchngs_cols %in% colnames(observations))){ + additional_useful_cols <- minimal_fetchngs_cols +}else{ + additional_useful_cols <- colnames(observations)[which(apply(observations, 2, function(x) max(nchar(x))) <= 20)] +} + +display_columns <- head(union(display_columns, additional_useful_cols), 5) + +# Also add informative columns +display_columns <- unique(c(display_columns, informative_variables)) +observations_to_print <- observations[,unique(display_columns)] +colnames(observations_to_print) <- prettifyVariablename(colnames(observations_to_print)) +print( htmltools::tagList(datatable(observations_to_print, caption = paste(ucfirst(params$observations_type), 'metadata'), rownames = FALSE, options = list(dom = 't')) )) + +``` + +## Contrasts + +Comparisons were made between `r params$observations_type` groups defined using using `r params$observation_type` metadata columns, as described in the following table of contrasts: + +```{r, echo=FALSE, results='asis'} +contrasts_to_print <- contrasts +colnames(contrasts_to_print) <- prettifyVariablename(colnames(contrasts_to_print)) +print( htmltools::tagList(datatable(contrasts_to_print, caption = paste0("Table of contrasts"), rownames = FALSE, options = list(dom = 't')) )) +``` + +# Results + +## Counts + +Input was a matrix of `r nrow(assay_data$raw)` `r params$features_type`s for `r ncol(assay_data$raw)` `r params$observations_type`s`r ifelse(nrow(assay_data$normalised) < nrow(assay_data$raw), paste0(', reduced to ', nrow(assay_data$normalised), ' ', params$features_type, 's after filtering for low abundance'), '')`. + +## Exploratory analysis + +### Abundance value distributions + +The following plots show the abundance value distributions of input matrices. A log2 transformation is applied where not already performed. + +```{r, include=FALSE} + +``` +#### Box plots {.tabset} + +```{r, echo=FALSE, results='asis', fig.height=8} +for (a in names(assay_data)) { + cat(paste0("\n##### ", prettifyVariablename(a), "\n")) + p <- ggplot_boxplot( + assay_data[[a]], + experiment = observations, + colorby = main_grouping_variable, + expressiontype = paste("count per", params$features_type), + palette = groupColorScale, + whisker_distance = params$exploratory_whisker_distance, + base_size=8 + ) + print(p) + cat("\n") +} +``` + +Whiskers in the above boxplots show `r params$exploratory_whisker_distance` times the inter-quartile range. 
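As noted above, a log2 transformation is applied where the input does not already look log-scaled. The following is a minimal sketch of that heuristic, essentially what this template does when loading the assay matrices, wrapped in an illustrative helper named `maybe_log2` (the helper name is not part of the pipeline):

```r
# Sketch of the log-scale heuristic: if the largest non-missing value exceeds 20,
# the matrix is assumed to hold raw intensities/counts and log2(x + 1) is applied;
# otherwise it is left untouched.
maybe_log2 <- function(mat, check_log = TRUE) {
  if (check_log && max(mat, na.rm = TRUE) > 20) {
    log2(mat + 1)
  } else {
    mat
  }
}

set.seed(1)
toy <- matrix(rexp(12, rate = 1e-4), nrow = 4,
              dimnames = list(paste0("feature", 1:4), paste0("sample", 1:3)))
toy[2, 3] <- NA  # missing values are tolerated thanks to na.rm = TRUE
range(maybe_log2(toy), na.rm = TRUE)
```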
+ +#### Density plots + +```{r, echo=FALSE, results='asis', fig.height=8} +plotly_densityplot( + assay_data, + experiment = observations, + colorby = params$observations_name_col, + expressiontype = paste("count per", params$features_type), + makeColorScale(length(unique(observations[[params$observations_id_col]])), palette = "Set1") +) +``` + +```{r, echo=FALSE, results='asis'} +cat(paste0("\n### ", ucfirst(params$observations_type), " relationships\n")) +``` + +#### Principal components plots + +Principal components analysis was conducted based on the `r params$exploratory_n_features` most variable `r params$features_type`s. Each component was annotated with its percent contribution to variance. + +```{r, echo=FALSE, results='asis'} +for (assay_type in rev(names(assay_data))){ + + pca_data <- pca_datas[[assay_type]] + + for (iv in informative_variables){ + + cat(paste0("\n##### ", prettifyVariablename(assay_type), " (", iv, ")\n")) + + plotdata <- pca_data$coords + plotdata$colorby <- factor( + observations[[iv]], + levels = unique(observations[[iv]]) + ) + pcaColorScale <- makeColorScale(length(unique(observations[[iv]])), palette = params$exploratory_palette_name) + + # Make plotting data combining PCA coords with coloring groups etc + + plotdata$name <- rownames(plotdata) + percentVar <- pca_data$percentVar + labels <- paste0(colnames(plotdata), " (", sprintf("%.1f", percentVar), "%)") + ncats <- length(unique(plotdata$colorby)) + + plot_types <- list("2" = "scatter", "3" = "scatter3d") + + for (d in names(plot_types)) { + + # Default plot args whatever we're doing + + plot_args <- list( + x = pca_data$coords[, 1], + y = pca_data$coords[, 2], + xlab = labels[1], + ylab = labels[2], + colorby = plotdata$colorby, + plot_type = plot_types[[d]], + palette = pcaColorScale, + legend_title = prettifyVariablename(iv), + labels = plotdata$name, + show_labels = TRUE + ) + if (d == "3") { + plot_args$z <- pca_data$coords[, 3] + plot_args$zlab <- labels[3] + } + + print(htmltools::tagList(do.call("plotly_scatterplot", plot_args))) + } + } +} +``` + + +#### Scree plots {.tabset} + +The scree plot below shows the proportion of variance that is explained by each of the PCA components. 
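The percent-variance annotations above, and the scree plot below, are derived from the component importances returned by `prcomp()`. A standalone sketch of that calculation on toy data (the report itself works from the loaded assay matrices):

```r
# Sketch: proportion of variance per principal component from prcomp(), toy data only.
set.seed(42)
mat <- matrix(rnorm(200), nrow = 20,
              dimnames = list(paste0("feature", 1:20), paste0("sample", 1:10)))

pca <- prcomp(t(na.omit(mat)), scale. = TRUE)  # samples in rows for prcomp()
imp <- summary(pca)$importance                 # SD, proportion and cumulative variance
pov <- data.frame(PC  = colnames(imp),
                  POV = unname(imp["Proportion of Variance", ]))
head(pov)
```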
+ +```{r, echo=FALSE, results='asis', message=F} +for (assay_type in rev(names(assay_data))){ + imp <- summary(pca_datas[[assay_type]]) + cat(paste0("\n##### ", prettifyVariablename(assay_type), "\n")) + + pca <- prcomp(t(na.omit((assay_data[[assay_type]]))), scale = TRUE) + imp <- t(as.data.frame(summary(pca)$importance)[2,]) + imp <- as.data.frame(cbind(PCA=rownames(imp), imp)) + colnames(imp) <- c("PCA", "POV") + + write.table(imp, file=paste0("/home-link/iivow01/git/differentialabundance/error/", assay_type, "_", "imp.tsv"), quote=F, sep="\t") + write.table(imp$PCA, file=paste0("/home-link/iivow01/git/differentialabundance/error/", assay_type, "_", "imp_data1.tsv"), quote=F, sep="\t") + write.table(imp$POV, file=paste0("/home-link/iivow01/git/differentialabundance/error/", assay_type, "_", "imp_data2.tsv"), quote=F, sep="\t") + + p <- ggplot(data=imp, aes(x=factor(PCA, level=imp$PCA), y=POV)) + geom_bar(stat="identity") + + xlab('') + + ylab('Proportion of Variance') + + theme_bw() + + theme(legend.title = element_blank()) + + theme(text = element_text(size=12)) + print(p) + cat("\n") +} +``` + +#### Principal components/ metadata associations + +For the variance stabilised matrix, an ANOVA test was used to determine assocations between continuous principal components and categorical covariates (including the variable of interest). + +The resulting p values are illustrated below. + +```{r, echo=FALSE, results='asis'} + +# This is a little hack to work around a bug in d3heatmap with single-row data +# frames. +if (nrow(pca_vs_meta) == 1){ + plot_pca_meta <- rbind(pca_vs_meta, pca_vs_meta) +}else{ + plot_pca_meta <- pca_vs_meta +} + + +write.table(-log10(plot_pca_meta), file=paste0("/home-link/iivow01/git/differentialabundance/error/plot_pca_meta_log.tsv"), quote=F, sep="\t") +write.table((plot_pca_meta), file=paste0("/home-link/iivow01/git/differentialabundance/error/plot_pca_meta.tsv"), quote=F, sep="\t") + +d3heatmap::d3heatmap( + -log10(plot_pca_meta), + Rowv = FALSE, + dendrogram = 'none', + cellnote = plot_pca_meta, + cexCol = 0.8, + cexRow = 0.8, + height = (100 + (15 * nrow(plot_pca_meta))), + colors = colorRampPalette( + rev( + RColorBrewer::brewer.pal(n = 7, name = "RdYlBu") + ) + )(100) +) + +for (variable in rownames(pca_vs_meta)){ + sig_comps <- pca_vs_meta[variable,] < 0.1 + + if (any(sig_comps)){ + min_sig_comp <- min(which(sig_comps)) + + min_sig_comp_p <- sprintf("%.2f", pca_vs_meta[variable, min_sig_comp]) + cat(paste0('The variable \'', variable, '\' shows an association with ', colnames(pca_vs_meta)[min_sig_comp], ' (p = ', min_sig_comp_p,'). ')) + } +} +``` + +#### Clustering dendrograms {.tabset} + +A hierarchical clustering of `r params$features_type`s was undertaken based on the top `r params$exploratory_n_features` most variable `r params$features_type`s. Distances between `r params$features_type`s were estimated based on `r params$exploratory_cor_method` correlation, which were then used to produce a clustering via the `r params$exploratory_clustering_method` method with `hclust()` in R. 
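The dendrograms themselves are drawn with `shinyngs::clusteringDendrogram()`; the core idea, a correlation converted to a distance and passed to `hclust()`, can be sketched as follows on toy data. The method names are stand-ins for the `exploratory_cor_method` and `exploratory_clustering_method` parameters.

```r
# Sketch of correlation-based hierarchical clustering as described above:
# correlation between columns -> distance (1 - correlation) -> hclust().
set.seed(7)
mat <- matrix(rnorm(300), nrow = 30,
              dimnames = list(paste0("feature", 1:30), paste0("sample", 1:10)))

cor_method     <- "spearman"  # stand-in for params$exploratory_cor_method
cluster_method <- "ward.D2"   # stand-in for params$exploratory_clustering_method

sample_dist <- as.dist(1 - cor(mat, method = cor_method))
hc <- hclust(sample_dist, method = cluster_method)
plot(hc, main = "Correlation-based clustering (toy data)")
```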
+ +```{r, echo=FALSE, results='asis'} +for (assay_type in rev(names(assay_data))){ + for (iv in informative_variables){ + cat(paste0("\n##### ", prettifyVariablename(assay_type), " (", iv, ")\n")) + capture.output(assay_data[[assay_type]], file=paste0("/home-link/iivow01/git/differentialabundance/error/selectstuff_", assay_type, "_", iv)) + capture.output(assay_type, file=paste0("/home-link/iivow01/git/differentialabundance/error/type")) + variable_genes <- selectVariableGenes(matrix = assay_data[[assay_type]], ntop = params$exploratory_n_features) + capture.output(assay_data[[assay_type]][variable_genes, ], file=paste0("/home-link/iivow01/git/differentialabundance/error/dendrostuff_", assay_type, "_", iv)) + + dendroColorScale <- makeColorScale(length(unique(observations[[iv]])), palette = params$exploratory_palette_name) + capture.output("assay_data[[assay_type]][variable_genes, ]", file=paste0("/home-link/iivow01/git/differentialabundance/error/dendrostuff_")) + p <- clusteringDendrogram( + 2^assay_data[[assay_type]][variable_genes, ], + observations[, iv, drop = FALSE], + colorby = iv, + cor_method = params$exploratory_cor_method, + plot_title = paste0( + paste0(params$observations_type," clustering dendrogram, "), + params$exploratory_n_features, + " most variable ", + params$features_type, + "s\n(", params$exploratory_clustering_method, " clustering, ", params$exploratory_cor_method, " correlation)"), + cluster_method = params$exploratory_clustering_method, + palette = dendroColorScale, + labelspace = 0.25 + ) + # Defaults in shinyngs make the text in this plot a bit big for the report, so + # scale it down a bit + print(p, vp=grid::viewport(gp=grid::gpar(cex=0.7))) + cat("\n") + } +} +``` + +### Outlier detection {.tabset} + +Outlier detection based on [median absolute deviation](https://wiki.arrayserver.com/wiki/index.php?title=CorrelationQC.pdf) was undertaken, the outlier scoring is plotted below. + +```{r, echo=FALSE, results='asis', warning=FALSE} + +# We can't look for ouliers in sets of less than 3 samples, so exclude variables +# unless the minimum group size is larger than that +iv_min_group_sizes <- unlist(lapply(informative_variables, function(x) min(table(observations[[x]])))) + +foo <- lapply(informative_variables[iv_min_group_sizes > 2], function(iv){ + + cat(paste("\n####", iv, "\n")) + + plotdata <- + madScore( + matrix = assay_data[[params$exploratory_final_assay]], + sample_sheet = observations, + groupby = iv + ) + + if (! 
is.null(plotdata)){ + mad_plot_args <- list( + x = plotdata$group, + y = plotdata$mad, + color = plotdata$outlier, + hline_thresholds = c("Outlier threshold" = params$exploratory_mad_threshold), + palette = makeColorScale(2, palette = params$differential_palette_name), + legend_title = "Outlier status", + labels = rownames(plotdata), + show_labels = TRUE, + xlab = "Sample group", + ylab = "MAD score" + ) + + print(htmltools::tagList(do.call("plotly_scatterplot", mad_plot_args))) + + outliers <- rownames(plotdata)[plotdata$outlier] + + if (length(outliers) == 0){ + cat(paste0("No outlying samples were detected in groups defined by ", iv,".\n")) + }else{ + cat(paste0(length(outliers), ' possible outliers were detected in groups defined by ', iv ,': ', paste(outliers, collapse=', '), "\n")) + } + } +}) + +``` + +## Differential analysis + +### Differential `r params$features_type` `r params$study_abundance_type` {.tabset} + +```{r, echo=FALSE, results='asis'} +foo <- lapply(names(p_value_types), function(pvt){ + cat("\n#### ", pvt, "\n") + print( htmltools::tagList(datatable(differential_tables[[pvt]], caption = paste0('Differential ', params$features_type, " ", params$abundance_type, ' (target relative to reference)'), options = list(dom = 't'), rownames = TRUE) )) + cat("\n") +}) +``` + +```{r, echo=FALSE, results='asis', eval = FALSE} + +differential_summary_string <- paste( + paste( + lapply( + 1:nrow(contrasts), + function(x){ + paste0( + "Contrast ", x, ' (', contrast_descriptions[x], ') ', "had ", differential_table[x,'up'], ' ', paste0(params$features_type, 's'), ' expressed significantly more highly in ', contrasts[x, 'target',], ' than ', contrasts[x, 'reference',], ' and ', differential_table[x,'down'], ' expressed at sifnificantly lower levels.' + ) + } + ), + collapse = ' ' + ) +) +cat(differential_summary_string) +``` + +### Volcano plots + +```{r, echo=FALSE, results='asis'} + +# Set up palette of 4 colors +volcano_palette <- colorRampPalette(colors = c("gray", "green", "blue", "red"))(4) + +# Two functions to add vertical/horizontal lines to the volcano plot +vline <- function(x = 0, color = "black") { + list( + type = "line", + y0 = 0, + y1 = 1, + yref = "paper", + x0 = x, + x1 = x, + line = list(color = color, dash="dot") + ) +} +hline <- function(y = 0, color = "black") { + list( + type = "line", + x0 = 0, + x1 = 1, + xref = "paper", + y0 = y, + y1 = y, + line = list(color = color, dash="dot") + ) +} + +for (i in 1:nrow(contrasts)){ + cat("\n#### ", contrast_descriptions[i], "\n") + + ## Make a volcano plot for the contrast first + + # Label features with symbol as well as identifier + if (! is.null(params$features) && (! is.null(params$differential_feature_name_column)) ){ + label_col <- params$differential_feature_name_column + }else{ + label_col <- params$differential_feature_id_column + } + + # Get the full set of differential stats for this contrast, removing rows with + # NAs in the fields we need. + full_de <- differential_results[[i]] + full_de <- subset(full_de, (! is.na(full_de[[params$differential_fc_column]])) & (! 
is.na(full_de[[params$differential_qval_colum]])) ) + #full_de[[params$differential_fc_column]] <- -log2(full_de[[params$differential_fc_column]]) + #full_de[[params$differential_pval_column]] <- -log10(full_de[[params$differential_pval_column]]) + + #full_de$color <- with(full_de, ifelse(params$differential_fc_column>=params$differential_min_fold_change & params$differential_pval_column<=params$differential_max_qval, 4, ifelse(params$differential_fc_column>=params$differential_min_fold_change, 2, ifelse(params$differential_pval_column<=params$differential_max_qval, 3, 1)))) + + full_de <- full_de + full_de$color <- 1 # default (black) + full_de$color[abs(full_de[[params$differential_fc_column]]) >= log2(params$differential_min_fold_change)] <- 2 # high FC (green) + full_de$color[full_de[[params$differential_fc_column]] >= params$differential_max_qval] <- 3 # low p val (blue) + full_de$color[abs(full_de[[params$differential_fc_column]]) >= log2(params$differential_min_fold_change) & full_de[[params$differential_fc_column]] >= params$differential_max_qval] <- 4 # high FC & low p val (red) + + write.table(full_de, file="/home-link/iivow01/git/differentialabundance/error/full_de.tsv", quote=F, sep="\t") + + # We'll color by whether features are differential according to supplied thresholds + + p_value_types <- list(Adjusted = params$differential_qval_column, Unadjusted = params$differential_pval_column) + p_value_thresholds <- list(Adjusted = params$differential_max_qval, Unadjusted = params$differential_max_pval) + + +for (pvt in names(p_value_types)){ + cat("\n##### ", pvt, " p values\n") + pval_column <- p_value_types[[pvt]] + + full_de$differential_status <- FALSE + full_de$differential_status[abs(full_de[[params$differential_fc_column]]) > log2(params$differential_min_fold_change) & full_de[[pval_column]] < p_value_thresholds[[pvt]]] <- TRUE + + # Define the thresholds we'll draw + + hline_thresholds = vline_thresholds = list() + hline_thresholds[[paste(pval_column, '=', p_value_thresholds[[pvt]])]] = -log10(p_value_thresholds[[pvt]]) + vline_thresholds[[paste(params$differential_fc_column, '<-', log2(params$differential_min_fold_change))]] = -log2(params$differential_min_fold_change) + vline_thresholds[[paste(params$differential_fc_column, '>', log2(params$differential_min_fold_change))]] = log2(params$differential_min_fold_change) + + write.table(full_de, file="/home-link/iivow01/git/differentialabundance/error/full_de.tsv", sep="\t", quote=F) + write.table(full_de[[pval_column]][full_de$color==1], file="/home-link/iivow01/git/differentialabundance/error/full_de_sub.tsv", sep="\t", quote=F) + capture.output(full_de$color, file="/home-link/iivow01/git/differentialabundance/error/colorbla.tsv") + + write.table(full_de$color, file="/home-link/iivow01/git/differentialabundance/error/color.tsv", sep="\t", quote=F) + plot_args <- list(type = "scatter", mode = 'markers') + + # Let's equalize the axes + max_fc <- max(abs(full_de[[params$differential_fc_column]])) * 1.1 + p <- do.call(plot_ly, plot_args) %>% + layout(xaxis = list(range=list(-max_fc, max_fc), + title = paste("higher in", contrasts$reference[i], " <<", params$differential_fc_column, ">> higher in", contrasts$target[i])) + # shapes = list( + # hline(-log10(p_value_thresholds[[pvt]])), + # hline(log10(p_value_thresholds[[pvt]])), + # vline(log2(params$differential_min_fold_change)) + # ) + ) %>% + add_trace(mode = "markers", name = "Not significant", x = full_de[[params$differential_fc_column]][full_de$color==1], y = 
-log10(full_de[[pval_column]][full_de$color==1]), marker = list(color = "black")) %>% + add_trace(mode = "markers", name = paste0("abs(", params$differential_fc_column, ")>", params$differential_min_fold_change), x = full_de[[params$differential_fc_column]][full_de$color==2], y = -log10(full_de[[pval_column]][full_de$color==2]), marker = list(color = "green")) %>% + add_trace(mode = "markers", name = paste0(pval_column, "<=", params$differential_max_qval), x = full_de[[params$differential_fc_column]][full_de$color==3], y = -log10(full_de[[pval_column]][full_de$color==3]), marker = list(color = "blue")) %>% + add_trace(mode = "markers", name = paste0("abs(", params$differential_fc_column, ")>", params$differential_min_fold_change, "\n& ", pval_column, "<=", params$differential_max_qval), x = full_de[[params$differential_fc_column]][full_de$color==4], y = -log10(full_de[[pval_column]][full_de$color==4]), marker = list(color = "red")) + +# differential_status[abs(full_de[[params$differential_fc_column]]) > log2(params$differential_min_fold_change) & full_de[[pval_column]] < p_value_thresholds[[pvt]]] + + + + print(htmltools::tagList(p)) + + + + + ## ... then show tables of the up/ down genes + + for (dir in c('up', 'down')){ + contrast_de <- sig_differential[[pvt]][[i]][[dir]] + cols_to_round <- c(params$differential_fc_column, params$differential_pval_column, params$differential_qval_column) + contrast_de[, cols_to_round] <- signif(contrast_de[, cols_to_round], 8) + + colnames(contrast_de) <- prettifyVariablename(colnames(contrast_de)) + + if (nrow(contrast_de) > 0){ + print( htmltools::tagList(datatable(contrast_de, caption = paste('Differential genes', dir, 'in', contrast_descriptions[i], " (check", differential_files[[i]], "for more detail)"), rownames = FALSE) )) + }else{ + cat(paste0("No significantly differential '", dir, "' genes.\n\n")) + } + } + } + +} +``` + + + +```{r, echo=FALSE, results='asis'} +possible_gene_set_methods <- c('gsea') +if (any(unlist(params[paste0(possible_gene_set_methods, '_run')]))){ + cat("\n### Gene set analysis\n") + + for (gene_set_method in possible_gene_set_methods){ + if (unlist(params[paste0(gene_set_method, '_run')])){ + cat("\n### ", toupper(gene_set_method) ," {.tabset}\n") + + for (gmt_file in simpleSplit(params$gsea_gene_sets)) { + gmt_name <- basename(tools::file_path_sans_ext(gmt_file)) + + cat("\n#### ", gmt_name ," {.tabset}\n") + reference_gsea_tables <- paste0(contrasts$id, ".", gmt_name, '.gsea_report_for_', contrasts$reference, '.tsv') + target_gsea_tables <- paste0(contrasts$id, ".", gmt_name, '.gsea_report_for_', contrasts$target, '.tsv') + + for (i in 1:nrow(contrasts)){ + cat("\n##### ", contrast_descriptions[i], "\n") + + target_gsea_results <- read_metadata(target_gsea_tables[i])[,c(-2,-3)] + print( htmltools::tagList(datatable(target_gsea_results, caption = paste0("\nTarget (", contrasts$target[i], ")\n"), rownames = FALSE) )) + + ref_gsea_results <- read_metadata(reference_gsea_tables[i])[,c(-2,-3)] + print( htmltools::tagList(datatable(ref_gsea_results, caption = paste0("\nReference (", contrasts$reference[i], ")\n"), rownames = FALSE) )) + } + } + } + } +} +``` + +# Methods + +## Filtering + +```{r, echo=FALSE, results='asis'} +make_params_table('feature-wise filtering', 'filtering_', remove_pattern = TRUE) +``` + +```{r, echo=FALSE, results='asis'} +filtering_string <- paste0('Filtering was carried out by selecting ', params$features_type, 's with an abundance of at least ', params$filtering_min_abundance) + +if 
(is.null(params$filtering_grouping_var)){ + if (is.null(params$filtering_min_proportion)){ + filtering_string <- paste0(filtering_string, ' in at least ', params$filtering_min_samples, ' ', params$observations_type, 's.') + }else{ + filtering_string <- paste0(filtering_string, ' in at least a proportion of ', params$filtering_min_proportion, ' of ', params$observations_type,'s.') + } +}else{ + if (is.null(params$filtering_min_proportion)){ + filtering_string <- paste0(filtering_string, ' in at least the number of ', params$observations_type, 's corresponding to the smallest group size defined by the grouping variable "', params$filtering_grouping_var, '".') + }else{ + filtering_string <- paste0(filtering_string, ' in at least a proportion of ', params$filtering_min_proportion, ' of the number of ', params$observations_type,'s corresponding to the smallest group size defined by the grouping variable"', params$filtering_grouping_var, '".') + } +} +cat(filtering_string) +``` + +## Exploratory analysis + +```{r, echo=FALSE, results='asis'} +make_params_table('exploratory analysis', 'exploratory_', remove_pattern = TRUE) +``` + +## Differential analysis + +```{r, echo=FALSE, results='asis'} +if (params$study_type == 'rnaseq'){ + make_params_table('DESeq2', 'deseq2_', remove_pattern = TRUE) +} +make_params_table('downstream differential analysis', 'differential_', remove_pattern = TRUE) +``` + + + +```{r, echo=FALSE, results='asis'} +possible_gene_set_methods <- c('gsea') + +if (any(unlist(params[paste0(possible_gene_set_methods, '_run')]))){ + cat("\n### Gene set analysis\n") + + for (gene_set_method in possible_gene_set_methods){ + if (unlist(params[paste0(gene_set_method, '_run')])){ + cat("\n### ", toupper(gene_set_method) ," {.tabset}\n") + make_params_table(toupper(gene_set_method), paste0(gene_set_method, '_'), remove_pattern = TRUE) + } + } + +} +``` + +# Appendices + +## All parameters + +```{r, echo=FALSE, results='asis'} +print( htmltools::tagList(datatable(params_table, caption = "All parameters", rownames = FALSE) )) +``` + +## Software versions + +**Note:** For a more detailed accounting of the software and commands used (including containers), consult the execution report produced as part of the 'pipeline info' for this workflow. + +```{r, echo=FALSE, results='asis'} +versions_table <- data.frame(do.call(rbind, strsplit(names(versions), split = '\\.')), unlist(versions)) +colnames(versions_table) <- c('Component', 'Software', 'Version') +print( htmltools::tagList(datatable(versions_table, caption = "Software versions", rownames = FALSE, options = list(dom = 'ft', paging = FALSE)) )) +``` + +```{r, echo=FALSE, results='asis'} +htmltools::includeMarkdown(params$citations) +``` diff --git a/modules/nf-core/proteus/main.nf b/modules/nf-core/proteus/main.nf index 3d977ea5..b00370c9 100644 --- a/modules/nf-core/proteus/main.nf +++ b/modules/nf-core/proteus/main.nf @@ -6,7 +6,7 @@ process PROTEUS { // conda "bioconda::r-proteus-bartongroup=0.2.16 conda-forge::r-plotly=4.10.1 bioconda::bioconductor-limma=3.54.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/r-proteus-bartongroup:0.2.16--r42hdfd78af_0' : - 'quay.io/biocontainers/mulled-v2-315db18c8d78a415a01c6264de61a7063523d1a0:e1c1e17f1fcd8a42a94770f3ebe242c6715270f8-0' }" + 'quay.io/biocontainers/mulled-v2-0ad0abd3e3e02e24e1626edaef6d6f9a967733fb:246a3b59c610a9cd35bdf8110be3a908e3769ae0-0' }" input: tuple val(meta), path(samplesheet), path(quants) diff --git a/modules/nf-core/proteus/templates/proteus_readproteingroups.R b/modules/nf-core/proteus/templates/proteus_readproteingroups.R index b7bc7e6b..8f430cd5 100755 --- a/modules/nf-core/proteus/templates/proteus_readproteingroups.R +++ b/modules/nf-core/proteus/templates/proteus_readproteingroups.R @@ -3,53 +3,7 @@ -customreadEvidenceFile <- function(file, measure.cols=measureColumns, data.cols=evidenceColumns, zeroes.are.missing=TRUE) { - - columns <- c(data.cols, measure.cols) - #write(columns, file="/home/iivow01/git/differentialabundance/error/columns") - write(anyDuplicated(names(columns)), file="/home/iivow01/git/differentialabundance/error/wiebitte", append=F) - write(((columns)), file="/home/iivow01/git/differentialabundance/error/wiebitte", append=T) - if ("\n" %in% names(columns)) { - write("wat", file="/home/iivow01/git/differentialabundance/error/wat") - } - if(anyDuplicated(names(columns))){ - dupcols <- names(columns)[duplicated(names(columns))] - err <- paste("Column names must be unique. Got the following duplicate columns:", dupcols) - write(err, file="/home/iivow01/git/differentialabundance/error/err") - write( names(columns), file="/home/iivow01/git/differentialabundance/error/namescolumns") - #stop(err) - } - - # check if all required columns are in the evidence file - evi.cols <- read.delim(file, header=TRUE, sep="\t", check.names=FALSE, as.is=TRUE, strip.white=TRUE, nrows = 1) - missing <- NULL - for(col in columns) { - if(!(col %in% colnames(evi.cols))) missing <- c(missing, paste0("'", col, "'")) - } - if(!is.null(missing)) - stop(paste0("Column(s) ", paste0(missing, collapse=", "), " not found in file ", file)) - - # read and process evidence file - evi <- read.delim(file, header=TRUE, sep="\t", check.names=FALSE, as.is=TRUE, strip.white=TRUE) - evi <- evi[, as.character(columns)] - names(evi) <- names(columns) - # replace NaNs and infinites with NAs in measure columns - # the same with zeroes if flag is on - for(col in names(measure.cols)) { - x <- evi[, col] - x[is.nan(x) | is.infinite(x)] <- NA - if(zeroes.are.missing) x[x == 0] <- NA - evi[, col] <- x - } - - # remove rows that have only NAs in measure columns - not.empty <- which(rowSums(!is.na(evi[,names(measure.cols), drop=FALSE])) > 0) - evi <- evi[not.empty,] -} - - - - +# TODO: Add link to https://rdrr.io/github/bartongroup/Proteus/man/readProteinGroups.html to docu and mention the necessary columns! @@ -211,7 +165,7 @@ library(proteus) ################################################ ################################################ -## READ IN COUNTS FILE AND SAMPLE METADATA ## +## READ IN QUANTS FILE AND SAMPLE METADATA ## ################################################ ################################################ @@ -238,37 +192,15 @@ if (! 
opt\$sample_id_col %in% colnames(sample.sheet)){ # Add metadata columns that are necessary for proteus sample.sheet\$sample <- sample.sheet[[opt\$sample_id_col]] - -#opt\$contrast_variable <- make.names(opt\$contrast_variable) sample.sheet\$condition <- sample.sheet[[opt\$contrast_variable]] # Add prefix for proteinGroups measurement columns to the sample IDs from the sampesheet measure.cols <- setNames(paste0(opt\$measure_col_prefix, sample.sheet[[opt\$sample_id_col]]), sample.sheet[[opt\$sample_id_col]]) -#measure.cols <- paste0(opt\$measure_col_prefix, sample.sheet[[opt\$sample_id_col]]) -#names(measure.cols) <- sample.sheet[[opt\$sample_id_col]] -write(measure.cols, file="/home/iivow01/git/differentialabundance/error/measurecols") -write(names(measure.cols), file="/home/iivow01/git/differentialabundance/error/measurecolsnames") -# TODO check if this can happen for proteingroups -# Sample sheet can have duplicate rows for multiple sequencing runs, so uniqify -# before assigning row names -#sample.sheet <- sample.sheet[! duplicated(sample.sheet[[opt\$sample_id_col]]), ] -#rownames(sample.sheet) <- sample.sheet[[opt\$sample_id_col]] - -# Check that all samples specified in the input sheet are present in the quants -# table +# Check that all samples specified in the input sheet are present in the quants table missing_columns <- paste0(opt\$measure_col_prefix, sample.sheet[[opt\$sample_id_col]]) missing_columns <- missing_columns[!missing_columns %in% colnames(quant.table)] -#missing_samples <- -# (sample.sheet[[opt\$sample_id_col]])[!missing_columns %in% colnames(quant.table)] - -# TODO: Consider if this auto-filter should be kept or removed (probably removed, otherwise I also have to deal with makenames) -#sample.sheet <- sample.sheet[!(rownames(sample.sheet) %in% missing_samples),] - -write(missing_columns, file="/home-link/iivow01/git/differentialabundance/error2/samplecols") -#write(missing_samples, file="/home-link/iivow01/git/differentialabundance/error2/missing_samples") - if (length(missing_columns) > 0) { stop(paste( length(missing_columns), @@ -277,11 +209,6 @@ if (length(missing_columns) > 0) { 'column in quant table. The following columns are missing:', paste(missing_columns, collapse = ', ') )) -} else { - # Save any non-quant data, with gene metadata etc we might need later - # TODO: Maybe just save the whole quant file? (or not; not sure the rest is ever needed) - nonquant.table <- - quant.table[, !colnames(quant.table) %in% paste0(opt\$measure_col_prefix, sample.sheet[[opt\$sample_id_col]]), drop = FALSE] } ################################################ @@ -311,12 +238,9 @@ if (length(invalid_normfuns)>0) { output_prefix <- opt\$contrast_variable -# TODO: Add link to https://rdrr.io/github/bartongroup/Proteus/man/readProteinGroups.html to docu and mention the necessary columns! 
-write.table(read.table(opt\$quant_file, sep="\t", header=T, check.names=F), file="/home/iivow01/git/differentialabundance/error/wtf.tsv", quote=F, sep="\t") - # Replace proteus default ID column with user param and re-set the names of the resulting object (gsub sets the names to NULL) -proteinColumns <- setNames(gsub("Majority protein IDs", opt\$protein_id_col, proteus::proteinColumns), names(proteus::proteinColumns)) +proteinColumns <- setNames(gsub("Majority protein IDs", opt\$protein_id_col, proteus::proteinColumns), names(proteus::proteinColumns)) proteinGroups <- readProteinGroups( file=opt\$quant_file, meta=sample.sheet, @@ -324,18 +248,13 @@ proteinGroups <- readProteinGroups( data.cols=proteinColumns ) -capture.output(proteinGroups, file="/home-link/iivow01/git/differentialabundance/error/proteingroups") -capture.output(str(proteinGroups), file="/home-link/iivow01/git/differentialabundance/error/proteingroupsstr") - -write("1", file="/home-link/iivow01/git/differentialabundance/error/status") +# Generate plots for all requested normalizations; also, save normalized protein groups for limma -write.table(proteinGroups\$tab, file="/home-link/iivow01/git/differentialabundance/error/tab", quote=F) -write("2", file="/home-link/iivow01/git/differentialabundance/error/status", append=T) - -# Generate plots for all requested normalizations; also, save -# normalized protein groups for limma for (normfun in normfuns) { proteinGroups.normalized <- normalizeData(proteinGroups, norm.fun = eval(parse(text=normfun))) # Proteus also accepts other norm.funs, e.g. from limma + + # Apply log2 and remove NAs as these will otherwise mess with some of the following modules + proteinGroups.normalized\$tab <- na.omit(log2(proteinGroups.normalized\$tab)) png(paste0(output_prefix, '.proteus.', normfun, '_normalised_distributions.png'), width = 5*300, height = 5*300, res = 300, pointsize = 8) @@ -363,21 +282,18 @@ for (normfun in normfuns) { ) dev.off() - - - summary <- summary(proteinGroups.normalized) - # R object for other processes to use + saveRDS(proteinGroups.normalized, file = paste0(output_prefix, '.proteus.', normfun, 'normalised_proteingroups.rds')) - # Write normalized count matrix + # Write normalized quant matrix + out_df <- data.frame( proteinGroups.normalized\$tab, check.names = FALSE ) - out_df[[opt\$protein_id_col]] = rownames(proteinGroups.normalized\$tab) - out_df <- out_df[c(opt\$protein_id_col, colnames(out_df)[colnames(out_df) != opt\$protein_id_col])] - + out_df[[opt\$protein_id_col]] <- rownames(proteinGroups.normalized\$tab) # proteus saves the IDs as rownames; make column from those + out_df <- out_df[c(opt\$protein_id_col, colnames(out_df)[colnames(out_df) != opt\$protein_id_col])] # move ID column to first position write.table( out_df, file = paste(output_prefix, 'proteus', normfun, 'normalised_proteingroups_tab', 'tsv', sep = '.'), @@ -388,13 +304,12 @@ for (normfun in normfuns) { ) } +# Process and save raw table -# Remove NAs as these will otherwise mess with some of the other modules -# TODO should I also leave the log2 here (or log10)? 
If so, I think I have to apply it only after doing the norms as otherwise, -# every norm table will be logged twice (or I could log the raw table and NOT the norms, but that does not work) proteinGroups\$tab <- na.omit(log2(proteinGroups\$tab)) # Generate raw distribution plot + png(paste0(output_prefix, '.proteus.raw_distributions.png'), width = 5*300, height = 5*300, res = 300, pointsize = 8) print( plotSampleDistributions(proteinGroups, title="Raw sample distributions", fill="condition", method=opt\$plotSampleDistributions_method) @@ -404,15 +319,17 @@ print( dev.off() # R object for other processes to use + saveRDS(proteinGroups, file = paste0(output_prefix, '.proteus.raw_proteingroups.rds')) -# Write raw count matrix +# Write raw quant matrix + out_df <- data.frame( proteinGroups\$tab, check.names = FALSE ) -out_df[[opt\$protein_id_col]] = rownames(proteinGroups\$tab) -out_df <- out_df[c(opt\$protein_id_col, colnames(out_df)[colnames(out_df) != opt\$protein_id_col])] +out_df[[opt\$protein_id_col]] <- rownames(proteinGroups\$tab) # proteus saves the IDs as rownames; make column from those +out_df <- out_df[c(opt\$protein_id_col, colnames(out_df)[colnames(out_df) != opt\$protein_id_col])] # move ID column to first position write.table( @@ -439,22 +356,20 @@ sink() ## VERSIONS FILE ## ################################################ ################################################ -#TODO + r.version <- strsplit(version[['version.string']], ' ')[[1]][3] limma.version <- as.character(packageVersion('limma')) plotly.version <- as.character(packageVersion('plotly')) proteus.version <- as.character(packageVersion('proteus')) -#TODO: change mparker2 -# writeLines( -# c( -# '"${task.process}":', -# paste(' r-base:', r.version), -# paste(' bioconductor-limma:', limma.version), -# paste(' r-plotly:', plotly.version), -# paste(' mparker2-proteus:', proteus.version), -# ), -# 'versions.yml') - +writeLines( + c( + '"${task.process}":', + paste(' r-base:', r.version), + paste(' bioconductor-limma:', limma.version), + paste(' r-plotly:', plotly.version), + paste(' r-proteus-bartongroup:', proteus.version) + ), +'versions.yml') ################################################ ################################################ ################################################ From 7f9feb8a44c84f438fee35af4a9d546908ea0fe7 Mon Sep 17 00:00:00 2001 From: WackerO Date: Wed, 28 Jun 2023 13:27:41 +0200 Subject: [PATCH 04/30] Cleaning up some changes --- assets/differentialabundance_report.Rmd | 9 ------ .../limma/differential/templates/limma_de.R | 3 -- modules/nf-core/proteus/main.nf | 14 ++++----- modules/nf-core/proteus/meta.yml | 5 ++-- .../templates/proteus_readproteingroups.R | 30 +++++++++---------- modules/nf-core/rmarkdownnotebook/main.nf | 1 - .../shinyngs/validatefomcomponents/main.nf | 1 - 7 files changed, 24 insertions(+), 39 deletions(-) diff --git a/assets/differentialabundance_report.Rmd b/assets/differentialabundance_report.Rmd index 8b98174c..dccf2dad 100644 --- a/assets/differentialabundance_report.Rmd +++ b/assets/differentialabundance_report.Rmd @@ -207,7 +207,6 @@ if (! params$observations_name_col %in% colnames(observations)){ } if (! 
is.null(params$features)){ - write(params$features, file="/home/iivow01/git/differentialabundance/error2/paramfeatures") features <- read_metadata(file.path(params$input_dir, params$features)) features <- features[,colnames(features) %in% simpleSplit(params$features_metadata_cols), drop = FALSE] } @@ -253,8 +252,6 @@ rownames(observations) <- observations[[params$observations_name_col]] # Run PCA early so we can understand how important each variable is pca_datas <- lapply(names(assay_data), function(assay_type){ - capture.output(assay_data[[assay_type]], file=paste0("/home-link/iivow01/git/differentialabundance/error/test_assaydatatypebla.txt")) #, quote=F, sep="\t" - compilePCAData(assay_data[[assay_type]]) }) names(pca_datas) <- names(assay_data) @@ -323,11 +320,6 @@ differential_results <- lapply(differential_files, function(diff_file){ } # Annotate differential tables if possible - write(colnames(features), file="/home-link/iivow01/git/differentialabundance/error2/featurescols") - write(colnames(params$features_id_col), file="/home-link/iivow01/git/differentialabundance/error2/featuresbla") - write(colnames(params$differential_feature_id_column), file="/home-link/iivow01/git/differentialabundance/error2/featuresbla", append=T) - write.table((features), file="/home-link/iivow01/git/differentialabundance/error2/featurescols.tsv", sep="\t", quote=F) - if (! is.null(params$features)){ diff <- merge(features, diff, by.x = params$features_id_col, by.y = params$differential_feature_id_column) } @@ -584,7 +576,6 @@ for (assay_type in rev(names(assay_data))){ variable_genes <- selectVariableGenes(matrix = assay_data[[assay_type]], ntop = params$exploratory_n_features) dendroColorScale <- makeColorScale(length(unique(observations[[iv]])), palette = params$exploratory_palette_name) - capture.output(assay_data[[assay_type]][variable_genes, ], file=paste0("/home-link/iivow01/git/differentialabundance/error2/dendrostuff_test_", assay_type, "_", iv)) p <- clusteringDendrogram( 2^assay_data[[assay_type]][variable_genes, ], diff --git a/modules/nf-core/limma/differential/templates/limma_de.R b/modules/nf-core/limma/differential/templates/limma_de.R index 1e0ab5a8..c6116928 100755 --- a/modules/nf-core/limma/differential/templates/limma_de.R +++ b/modules/nf-core/limma/differential/templates/limma_de.R @@ -147,7 +147,6 @@ library(limma) ## READ IN COUNTS FILE AND SAMPLE METADATA ## ################################################ ################################################ -write("0", file="/home-link/iivow01/git/differentialabundance/error2/aisdliasdlaisd") intensities.table <- read_delim_flexible( @@ -156,10 +155,8 @@ intensities.table <- row.names = opt\$probe_id_col, check.names = FALSE ) -write("1", file="/home-link/iivow01/git/differentialabundance/error2/aisdliasdlaisd") sample.sheet <- read_delim_flexible(file = opt\$sample_file) -write("2", file="/home-link/iivow01/git/differentialabundance/error2/aisdliasdlaisd") # Deal with spaces that may be in sample column opt\$sample_id_col <- make.names(opt\$sample_id_col) diff --git a/modules/nf-core/proteus/main.nf b/modules/nf-core/proteus/main.nf index b00370c9..3a22e1bb 100644 --- a/modules/nf-core/proteus/main.nf +++ b/modules/nf-core/proteus/main.nf @@ -9,19 +9,19 @@ process PROTEUS { 'quay.io/biocontainers/mulled-v2-0ad0abd3e3e02e24e1626edaef6d6f9a967733fb:246a3b59c610a9cd35bdf8110be3a908e3769ae0-0' }" input: - tuple val(meta), path(samplesheet), path(quants) + tuple val(meta), path(samplesheet), path(intensities) output: - tuple val(meta), 
path("*normalised_distributions.png") , emit: nonnorm_dist_plot + tuple val(meta), path("*normalised_distributions.png") , emit: raw_dist_plot tuple val(meta), path("*normalised_distributions.png") , emit: norm_dist_plot tuple val(meta), path("*mean_variance_relationship.png") , emit: mean_var_relationship_plot tuple val(meta), path("*dendrogram.png") , emit: dendro_plot - tuple val(meta), path("*raw_proteingroups.rds") , emit: rdata - tuple val(meta), path("*raw_proteingroups_tab.tsv") , emit: tab + tuple val(meta), path("*raw_proteingroups.rds") , emit: rdata + tuple val(meta), path("*raw_proteingroups_tab.tsv") , emit: tab tuple val(meta), path("*normalised_proteingroups_tab.tsv") , emit: normtab -// tuple val(meta), path("*normalised_proteingroups_tab2.tsv") , emit: normtab2 - tuple val(meta), path("*R_sessionInfo.log") , emit: session_info -// path "versions.yml" , emit: versions +// tuple val(meta), path("*normalised_proteingroups_tab2.tsv"), emit: normtab2 + tuple val(meta), path("*R_sessionInfo.log") , emit: session_info + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when diff --git a/modules/nf-core/proteus/meta.yml b/modules/nf-core/proteus/meta.yml index 99f0bb0d..b657d809 100644 --- a/modules/nf-core/proteus/meta.yml +++ b/modules/nf-core/proteus/meta.yml @@ -1,9 +1,8 @@ name: "limma_differential" description: runs a differential expression analysis with Limma keywords: - - differential - - expression - - microarray + - intensities + - proteomics - limma tools: diff --git a/modules/nf-core/proteus/templates/proteus_readproteingroups.R b/modules/nf-core/proteus/templates/proteus_readproteingroups.R index 8f430cd5..263e18e0 100755 --- a/modules/nf-core/proteus/templates/proteus_readproteingroups.R +++ b/modules/nf-core/proteus/templates/proteus_readproteingroups.R @@ -103,12 +103,12 @@ round_dataframe_columns <- function(df, columns = NULL, digits = 8){ # Set defaults and classes opt <- list( - quant_file = '$quants', + intensities_file = '$intensities', sample_file = '$samplesheet', contrast_variable = NULL, protein_id_col = 'Majority protein IDs', sample_id_col = 'sample', - measure_col_prefix = 'Intensity', + measure_col_prefix = 'intensities', normfuns = 'normalizeMedian', plotSampleDistributions_method = 'violin', plotMV_loess = T, @@ -134,7 +134,7 @@ for ( ao in names(args_opt)){ # Check if required parameters have been provided -required_opts <- c('quant_file', 'sample_file', 'contrast_variable') +required_opts <- c('intensities_file', 'sample_file', 'contrast_variable') missing <- required_opts[unlist(lapply(opt[required_opts], is.null)) | ! required_opts %in% names(opt)] if (length(missing) > 0){ @@ -143,7 +143,7 @@ if (length(missing) > 0){ # Check file inputs are valid -for (file_input in c('quant_file', 'sample_file')){ +for (file_input in c('intensities_file', 'sample_file')){ if (is.null(opt[[file_input]])) { stop(paste("Please provide", file_input), call. = FALSE) } @@ -165,13 +165,13 @@ library(proteus) ################################################ ################################################ -## READ IN QUANTS FILE AND SAMPLE METADATA ## +# READ IN INTENSITIES FILE AND SAMPLE METADATA # ################################################ ################################################ -quant.table <- +intensities.table <- read_delim_flexible( - file = opt\$quant_file, + file = opt\$intensities_file, check.names = FALSE ) @@ -181,8 +181,8 @@ sample.sheet <- check.names=FALSE ) -if (! 
opt\$protein_id_col %in% colnames(quant.table)){ - stop(paste0("Specified protein ID column '", opt\$protein_id_col, "' is not in the quant table")) +if (! opt\$protein_id_col %in% colnames(intensities.table)){ + stop(paste0("Specified protein ID column '", opt\$protein_id_col, "' is not in the intensities table")) } if (! opt\$sample_id_col %in% colnames(sample.sheet)){ @@ -197,16 +197,16 @@ sample.sheet\$condition <- sample.sheet[[opt\$contrast_variable]] # Add prefix for proteinGroups measurement columns to the sample IDs from the sampesheet measure.cols <- setNames(paste0(opt\$measure_col_prefix, sample.sheet[[opt\$sample_id_col]]), sample.sheet[[opt\$sample_id_col]]) -# Check that all samples specified in the input sheet are present in the quants table +# Check that all samples specified in the input sheet are present in the intensities table missing_columns <- paste0(opt\$measure_col_prefix, sample.sheet[[opt\$sample_id_col]]) -missing_columns <- missing_columns[!missing_columns %in% colnames(quant.table)] +missing_columns <- missing_columns[!missing_columns %in% colnames(intensities.table)] if (length(missing_columns) > 0) { stop(paste( length(missing_columns), 'specified samples do not have a(n)', opt\$measure_col_prefix, - 'column in quant table. The following columns are missing:', + 'column in intensities table. The following columns are missing:', paste(missing_columns, collapse = ', ') )) } @@ -242,7 +242,7 @@ output_prefix <- opt\$contrast_variable proteinColumns <- setNames(gsub("Majority protein IDs", opt\$protein_id_col, proteus::proteinColumns), names(proteus::proteinColumns)) proteinGroups <- readProteinGroups( - file=opt\$quant_file, + file=opt\$intensities_file, meta=sample.sheet, measure.cols=measure.cols, data.cols=proteinColumns @@ -286,7 +286,7 @@ for (normfun in normfuns) { saveRDS(proteinGroups.normalized, file = paste0(output_prefix, '.proteus.', normfun, 'normalised_proteingroups.rds')) - # Write normalized quant matrix + # Write normalized intensities matrix out_df <- data.frame( proteinGroups.normalized\$tab, @@ -322,7 +322,7 @@ dev.off() saveRDS(proteinGroups, file = paste0(output_prefix, '.proteus.raw_proteingroups.rds')) -# Write raw quant matrix +# Write raw intensities matrix out_df <- data.frame( proteinGroups\$tab, diff --git a/modules/nf-core/rmarkdownnotebook/main.nf b/modules/nf-core/rmarkdownnotebook/main.nf index 4bcdcc35..ec8f21b1 100644 --- a/modules/nf-core/rmarkdownnotebook/main.nf +++ b/modules/nf-core/rmarkdownnotebook/main.nf @@ -60,7 +60,6 @@ process RMARKDOWNNOTEBOOK { } """ - echo $parameters > /home/iivow01/git/differentialabundance/error2/parameters # Dump .params.yml heredoc (section will be empty if parametrization is disabled) ${indent_code_block(params_cmd, 4)} diff --git a/modules/nf-core/shinyngs/validatefomcomponents/main.nf b/modules/nf-core/shinyngs/validatefomcomponents/main.nf index 97ec6448..4a80e042 100644 --- a/modules/nf-core/shinyngs/validatefomcomponents/main.nf +++ b/modules/nf-core/shinyngs/validatefomcomponents/main.nf @@ -30,7 +30,6 @@ process SHINYNGS_VALIDATEFOMCOMPONENTS { def feature = feature_meta ? 
"--feature_metadata '$feature_meta'" : '' """ - echo $args > "/home-link/iivow01/git/differentialabundance/error2/val" validate_fom_components.R \\ --sample_metadata "$sample" \\ $feature \\ From 7757ba33509b9198a3487842e693cfefc321de37 Mon Sep 17 00:00:00 2001 From: WackerO Date: Thu, 6 Jul 2023 13:18:39 +0200 Subject: [PATCH 05/30] changed proteus configs --- conf/modules.config | 10 + .../nf-core/proteus/readproteingroups/main.nf | 32 ++ .../proteus/readproteingroups/meta.yml | 74 ++++ .../templates/proteus_readproteingroups.R | 366 ++++++++++++++++++ nextflow.config | 3 + 5 files changed, 485 insertions(+) create mode 100644 modules/nf-core/proteus/readproteingroups/main.nf create mode 100644 modules/nf-core/proteus/readproteingroups/meta.yml create mode 100644 modules/nf-core/proteus/readproteingroups/templates/proteus_readproteingroups.R diff --git a/conf/modules.config b/conf/modules.config index 625683e7..ad6ac534 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -34,6 +34,16 @@ process { ] ext.args = "--feature-type transcript" } + + withName: PROTEUS { + publishDir = [ + [ + path: { "${params.outdir}/proteus" } + ] + ] + //ext.args = "--sample_id_col \"Sample Number\" --measure_col_prefix \"LFQ intensity \"" + ext.args = "--sample_id_col \"${params.observations_id_col}\" --measure_col_prefix \"${params.maxquant_measurecol_prefix}\" " + } withName: VALIDATOR { publishDir = [ diff --git a/modules/nf-core/proteus/readproteingroups/main.nf b/modules/nf-core/proteus/readproteingroups/main.nf new file mode 100644 index 00000000..02c61e19 --- /dev/null +++ b/modules/nf-core/proteus/readproteingroups/main.nf @@ -0,0 +1,32 @@ +process PROTEUS_READPROTEINGROUPS { + tag "$meta" + label 'process_single' + + conda "r-base=4.2.1 r-proteus-bartongroup=0.2.16 bioconductor-limma=3.54.0 r-plotly=4.10.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/mulled-v2-4e01206f2c47f56077f04e5d2d7b312f50513a1e:92abccefbeb09795ad6a93553b62a6ad3daaea48-0': + 'quay.io/biocontainers/mulled-v2-4e01206f2c47f56077f04e5d2d7b312f50513a1e:92abccefbeb09795ad6a93553b62a6ad3daaea48-0' }" + + input: + tuple val(meta), path(samplesheet), path(intensities) + tuple val(meta2), val(contrast_variable) + + + output: + tuple val(meta), path("*dendrogram.png") , emit: dendro_plot + tuple val(meta), path("*mean_variance_relationship.png") , emit: mean_var_plot + tuple val(meta), path("*normalised_distributions.png") , emit: raw_dist_plot + tuple val(meta), path("*normalised_distributions.png") , emit: norm_dist_plot + tuple val(meta), path("*raw_proteingroups.rds") , emit: raw_rdata + tuple val(meta), path("*raw_proteingroups_tab.tsv") , emit: raw_tab + tuple val(meta), path("*normalised_proteingroups.rds") , emit: norm_rdata + tuple val(meta), path("*normalised_proteingroups_tab.tsv") , emit: norm_tab + tuple val(meta), path("*R_sessionInfo.log") , emit: session_info + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + template 'proteus_readproteingroups.R' +} diff --git a/modules/nf-core/proteus/readproteingroups/meta.yml b/modules/nf-core/proteus/readproteingroups/meta.yml new file mode 100644 index 00000000..9ec8ac8e --- /dev/null +++ b/modules/nf-core/proteus/readproteingroups/meta.yml @@ -0,0 +1,74 @@ +name: "proteus_readproteingroups" +description: reads a maxQuant proteinGroups file with Proteus +keywords: + - intensities + - proteomics + - proteus +tools: + - "proteus": + description: "R package for analysing proteomics data" + homepage: "https://github.com/bartongroup/Proteus" + documentation: "https://rdrr.io/github/bartongroup/Proteus/" + tool_dev_url: "https://github.com/bartongroup/Proteus" + doi: "10.1101/416511" + licence: "['GPL v2']" + +input: + - meta: + type: map + description: | + Groovy Map containing contrast information, e.g. 
[ variable:'treatment', reference:'treated', + control:'saline', blocking:'' ] + - samplesheet: + type: file + description: | + CSV or TSV format sample sheet with sample metadata; check here for specifications: https://rdrr.io/github/bartongroup/Proteus/man/readProteinGroups.html + - intensities: + type: file + description: | + proteinGroups TXT file with protein intensities information from maxQuant; check here for specifications: https://rdrr.io/github/bartongroup/Proteus/man/readProteinGroups.html + +output: + - raw_dist_plot: + type: file + description: | + PNG file; plot of the intensity/ratio distributions of the raw samples + - norm_dist_plot: + type: file + description: | + PNG file; plot of the intensity/ratio distributions of the normalized samples + - mean_var_plot: + type: file + description: | + PNG file; plot of the log-intensity variance vs log-intensity mean of each condition in the normalized samples + - dendro_plot: + type: file + description: | + PNG file; dendrogram of the normalized samples hierarchically clustered by their intensities + - raw_rdata: + type: file + description: | + RDS file of a proteinGroups object from Proteus, contains raw protein intensities and additional info + - raw_tab: + type: file + description: | + TSV-format intensities table from Proteus, contains raw protein intensities + - norm_rdata: + type: file + description: | + RDS file of a proteinGroups object from Proteus, contains normalized protein intensities and additional info + - norm_tab: + type: file + description: | + TSV-format intensities table from Proteus, contains normalized protein intensities + - session_info: + type: file + description: | + LOG file of the R sessionInfo from the module run + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@WackerO" diff --git a/modules/nf-core/proteus/readproteingroups/templates/proteus_readproteingroups.R b/modules/nf-core/proteus/readproteingroups/templates/proteus_readproteingroups.R new file mode 100644 index 00000000..14eafdf2 --- /dev/null +++ b/modules/nf-core/proteus/readproteingroups/templates/proteus_readproteingroups.R @@ -0,0 +1,366 @@ +#!/usr/bin/env Rscript + +################################################ +################################################ +## Functions ## +################################################ +################################################ + +#' Parse out options from a string without recourse to optparse +#' +#' @param x Long-form argument list like --opt1 val1 --opt2 val2 +#' +#' @return named list of options and values similar to optparse + +parse_args <- function(x) { + args_list <- unlist(strsplit(x, ' ?--')[[1]])[-1] + args_vals <- lapply(args_list, function(x) scan(text=x, what='character', quiet = TRUE)) + + # Ensure the option vectors are length 2 (key/ value) to catch empty ones + args_vals <- lapply(args_vals, function(z) { length(z) <- 2; z}) + + parsed_args <- structure(lapply(args_vals, function(x) x[2]), names = lapply(args_vals, function(x) x[1])) + parsed_args[! 
is.na(parsed_args)] +} + +#' Flexibly read CSV or TSV files +#' +#' @param file Input file +#' @param header Passed to read.delim() +#' @param row.names Passed to read.delim() +#' +#' @return output Data frame + +read_delim_flexible <- function(file, header = TRUE, row.names = NULL, check.names = F) { + + ext <- tolower(tail(strsplit(basename(file), split = "\\\\.")[[1]], 1)) + + if (ext == "tsv" || ext == "txt") { + separator <- "\\t" + } else if (ext == "csv") { + separator <- "," + } else { + stop(paste("Unknown separator for", ext)) + } + + read.delim( + file, + sep = separator, + header = header, + row.names = row.names, + check.names = check.names + ) +} + +#' Round numeric dataframe columns to fixed decimal places by applying +#' formatting and converting back to numerics +#' +#' @param dataframe A data frame +#' @param columns Which columns to round (assumes all of them by default) +#' @param digits How many decimal places to round to? +#' +#' @return output Data frame +# TODO check if this is necessary +round_dataframe_columns <- function(df, columns = NULL, digits = 8) { + if (is.null(columns)) { + columns <- colnames(df) + } + + df[,columns] <- format( + data.frame(df[, columns], check.names = FALSE), + nsmall = digits + ) + + # Convert columns back to numeric + + for (c in columns) { + df[[c]][grep("^ *NA\$", df[[c]])] <- NA + df[[c]] <- as.numeric(df[[c]]) + } + df +} + +################################################ +################################################ +## PARSE PARAMETERS FROM NEXTFLOW ## +################################################ +################################################ + +# I've defined these in a single array like this so that we could go back to an +# optparse-driven method in future with module bin/ directories, rather than +# the template + +# Set defaults and classes + +opt <- list( + intensities_file = '$intensities', + sample_file = '$samplesheet', + contrast_variable = '$contrast_variable', + protein_id_col = 'Majority protein IDs', + sample_id_col = 'sample', + measure_col_prefix = 'intensities', + normfuns = 'normalizeMedian', + plotSampleDistributions_method = 'violin', + plotMV_loess = T, + palette_name = 'Set1' +) +opt_types <- lapply(opt, class) + +# Apply parameter overrides + +args_opt <- parse_args('$task.ext.args') +for ( ao in names(args_opt)) { + if (! ao %in% names(opt)) { + stop(paste("Invalid option:", ao)) + } else { + + # Preserve classes from defaults where possible + if (! is.null(opt[[ao]])) { + args_opt[[ao]] <- as(args_opt[[ao]], opt_types[[ao]]) + } + opt[[ao]] <- args_opt[[ao]] + } +} + +# Check if required parameters have been provided + +required_opts <- c('intensities_file', 'sample_file', 'contrast_variable') +missing <- required_opts[unlist(lapply(opt[required_opts], is.null)) | ! required_opts %in% names(opt)] + +if (length(missing) > 0) { + stop(paste("Missing required options:", paste(missing, collapse=', '))) +} + +# Check file inputs are valid + +for (file_input in c('intensities_file', 'sample_file')) { + if (is.null(opt[[file_input]])) { + stop(paste("Please provide", file_input), call. = FALSE) + } + + if (! 
file.exists(opt[[file_input]])) { + stop(paste0('Value of ', file_input, ': ', opt[[file_input]], ' is not a valid file')) + } +} + +################################################ +################################################ +## Finish loading libraries ## +################################################ +################################################ + +library(limma) +library(plotly) +library(proteus) + +################################################ +################################################ +# READ IN INTENSITIES FILE AND SAMPLE METADATA # +################################################ +################################################ + +intensities.table <- + read_delim_flexible( + file = opt\$intensities_file, + check.names = FALSE + ) + +sample.sheet <- + read_delim_flexible( + file = opt\$sample_file, + check.names=FALSE + ) + +if (! opt\$protein_id_col %in% colnames(intensities.table)) { + stop(paste0("Specified protein ID column '", opt\$protein_id_col, "' is not in the intensities table; exiting...Valid columns are: ", paste(colnames(intensities.table), collapse=", "))) +} + +if (! opt\$sample_id_col %in% colnames(sample.sheet)) { + stop(paste0("Specified sample ID column '", opt\$sample_id_col, "' is not in the sample sheet; exiting...Valid columns are: ", paste(colnames(sample.sheet), collapse=", "))) +} + +# Add metadata columns that are necessary for proteus + +sample.sheet\$sample <- sample.sheet[[opt\$sample_id_col]] +sample.sheet\$condition <- sample.sheet[[opt\$contrast_variable]] + +# Add prefix for proteinGroups measurement columns to the sample IDs from the sampesheet +measure.cols <- setNames(paste0(opt\$measure_col_prefix, sample.sheet[[opt\$sample_id_col]]), sample.sheet[[opt\$sample_id_col]]) + +# Check that all samples specified in the input sheet are present in the intensities table + +missing_columns <- paste0(opt\$measure_col_prefix, sample.sheet[[opt\$sample_id_col]]) +missing_columns <- missing_columns[!missing_columns %in% colnames(intensities.table)] +if (length(missing_columns) > 0) { + stop(paste( + length(missing_columns), + 'specified samples do not have a(n)', + opt\$measure_col_prefix, + 'column in intensities table. The following columns are missing:', + paste(missing_columns, collapse = ', ') + )) +} + +################################################ +################################################ +## CHECK AND FORMAT NORMFUN AND FILTERFUN ## +################################################ +################################################ + +valid_normfuns <- c("normalizeMedian", "normalizeQuantiles") +normfuns <- opt\$normfuns + +# Check validity of normfun(s) +invalid_normfuns <- normfuns[!(normfuns %in% valid_normfuns)] +if (length(invalid_normfuns)>0) { + stop(paste0("Invalid normfuns argument(s): ", + paste(invalid_normfuns, collapse=", "), + ". 
Valid normfuns are: ", + paste(valid_normfuns, collapse=", "), + "; exiting...")) +} + +################################################ +################################################ +## Run Proteus processes and generate outputs ## +################################################ +################################################ + +output_prefix <- opt\$contrast_variable + +# Replace proteus default ID column with user param and re-set the names of the resulting object (gsub sets the names to NULL) + +proteinColumns <- setNames(gsub("Majority protein IDs", opt\$protein_id_col, proteus::proteinColumns), names(proteus::proteinColumns)) +proteinGroups <- readProteinGroups( + file=opt\$intensities_file, + meta=sample.sheet, + measure.cols=measure.cols, + data.cols=proteinColumns +) + +# Generate plots for all requested normalizations; also, save normalized protein groups for limma + +for (normfun in normfuns) { + proteinGroups.normalized <- normalizeData(proteinGroups, norm.fun = eval(parse(text=normfun))) # Proteus also accepts other norm.funs, e.g. from limma + + # Apply log2 and remove NAs as these will otherwise mess with some of the following modules + + proteinGroups.normalized\$tab <- na.omit(log2(proteinGroups.normalized\$tab)) + + png(paste(output_prefix, 'proteus', normfun, 'normalized_distributions.png', sep = '.'), width = 5*300, height = 5*300, res = 300, pointsize = 8) + print( + plotSampleDistributions(proteinGroups.normalized, title=paste0("Sample distributions after applying\n", normfun), fill="condition", method=opt\$plotSampleDistributions_method) + + scale_fill_brewer(palette=opt\$palette_name, name=opt\$contrast_variable) + + theme(plot.title = element_text(size = 12)) + ) + dev.off() + + png(paste(output_prefix, 'proteus', normfun, 'normalized_mean_variance_relationship.png', sep = '.'), width = 5*300, height = 5*300, res = 300, pointsize = 8) + print( + plotMV(proteinGroups.normalized, with.loess=opt\$plotMV_loess) + + ggtitle(paste0("Sample mean variance relationship after applying\n", normfun)) + + scale_fill_distiller(palette=opt\$palette_name) + + theme(plot.title = element_text(size = 12)) + ) + dev.off() + + png(paste(output_prefix, 'proteus', normfun, 'normalized_dendrogram.png', sep = '.'), width = 5*300, height = 5*300, res = 300, pointsize = 8) + print( + plotClustering(proteinGroups.normalized) + + ggtitle(paste0("Sample clustering after applying\n", normfun)) + + theme(plot.title = element_text(size = 12)) + ) + dev.off() + + # R object for other processes to use + + saveRDS(proteinGroups.normalized, file = paste(output_prefix, 'proteus', normfun, 'normalized_proteingroups.rds', sep=".")) + + # Write normalized intensities matrix + + out_df <- data.frame( + proteinGroups.normalized\$tab, + check.names = FALSE + ) + out_df[[opt\$protein_id_col]] <- rownames(proteinGroups.normalized\$tab) # proteus saves the IDs as rownames; make column from those + out_df <- out_df[c(opt\$protein_id_col, colnames(out_df)[colnames(out_df) != opt\$protein_id_col])] # move ID column to first position + write.table( + out_df, + file = paste(output_prefix, 'proteus', normfun, 'normalized_proteingroups_tab', 'tsv', sep = '.'), + col.names = TRUE, + row.names = FALSE, + sep = '\t', + quote = FALSE + ) +} + +# Process and save raw table + +proteinGroups\$tab <- na.omit(log2(proteinGroups\$tab)) + +# Generate raw distribution plot + +png(paste(output_prefix, 'proteus.raw_distributions.png', sep = '.'), width = 5*300, height = 5*300, res = 300, pointsize = 8) +print( + 
plotSampleDistributions(proteinGroups, title="Raw sample distributions", fill="condition", method=opt\$plotSampleDistributions_method) + + scale_fill_brewer(palette=opt\$palette_name, name=opt\$contrast_variable) + + theme(plot.title = element_text(size = 12)) + ) +dev.off() +# R object for other processes to use + +saveRDS(proteinGroups, file = paste(output_prefix, 'proteus.raw_proteingroups.rds', sep = '.')) + +# Write raw intensities matrix + +out_df <- data.frame( + proteinGroups\$tab, + check.names = FALSE + ) +out_df[[opt\$protein_id_col]] <- rownames(proteinGroups\$tab) # proteus saves the IDs as rownames; make column from those +out_df <- out_df[c(opt\$protein_id_col, colnames(out_df)[colnames(out_df) != opt\$protein_id_col])] # move ID column to first position + + +write.table( + out_df, + file = paste(output_prefix, 'proteus', 'raw_proteingroups_tab', 'tsv', sep = '.'), + col.names = TRUE, + row.names = FALSE, + sep = '\t', + quote = FALSE +) + +################################################ +################################################ +## R SESSION INFO ## +################################################ +################################################ + +sink("R_sessionInfo.log") +print(sessionInfo()) +sink() + +################################################ +################################################ +## VERSIONS FILE ## +################################################ +################################################ + +r.version <- strsplit(version[['version.string']], ' ')[[1]][3] +limma.version <- as.character(packageVersion('limma')) +plotly.version <- as.character(packageVersion('plotly')) +proteus.version <- as.character(packageVersion('proteus')) +writeLines( + c( + '"${task.process}":', + paste(' r-base:', r.version), + paste(' bioconductor-limma:', limma.version), + paste(' r-plotly:', plotly.version), + paste(' r-proteus-bartongroup:', proteus.version) + ), +'versions.yml') + +################################################ +################################################ +################################################ +################################################ \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index a62f37a8..7e27beca 100644 --- a/nextflow.config +++ b/nextflow.config @@ -52,6 +52,9 @@ params { affy_rm_extra = false affy_build_annotation = true + // MaxQuant-specific options + maxquant_measurecol_prefix = 'LFQ intensity ' + // Filtering options filtering_min_samples = 1 filtering_min_abundance = 1 From 2a7a5ee86a48b47d7c19e4e0c403823d9ace5212 Mon Sep 17 00:00:00 2001 From: WackerO Date: Thu, 13 Jul 2023 12:37:21 +0200 Subject: [PATCH 06/30] Installed and integrated proteus --- .github/workflows/ci.yml | 1 + assets/Px_report.Rmd | 979 ------------------ conf/modules.config | 41 +- modules.json | 5 + modules/nf-core/proteus/main.nf | 31 - modules/nf-core/proteus/meta.yml | 59 -- .../nf-core/proteus/readproteingroups/main.nf | 12 +- .../proteus/readproteingroups/meta.yml | 38 +- .../templates/proteus_readproteingroups.R | 132 ++- .../templates/proteus_readproteingroups.R | 376 ------- nextflow.config | 12 +- nextflow_schema.json | 48 +- workflows/differentialabundance.nf | 76 +- 13 files changed, 239 insertions(+), 1571 deletions(-) delete mode 100644 assets/Px_report.Rmd delete mode 100644 modules/nf-core/proteus/main.nf delete mode 100644 modules/nf-core/proteus/meta.yml delete mode 100755 modules/nf-core/proteus/templates/proteus_readproteingroups.R diff --git a/.github/workflows/ci.yml 
b/.github/workflows/ci.yml index 771f018e..e085f338 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -30,6 +30,7 @@ jobs: - "test" - "test_nogtf" - "test_affy" + - "test_maxquant" steps: - name: Check out pipeline code uses: actions/checkout@v3 diff --git a/assets/Px_report.Rmd b/assets/Px_report.Rmd deleted file mode 100644 index 2b35a0d7..00000000 --- a/assets/Px_report.Rmd +++ /dev/null @@ -1,979 +0,0 @@ ---- -output: - html_document: - toc: true # table of contents - toc_float: true # float the table of contents to the left of the main document content - toc_depth: 4 # header levels 1,2,3 - theme: default - number_sections: false # add section numbering to headers - df_print: paged # tables are printed as an html table with support for pagination over rows and columns - highlight: pygments - pdf_document: true - pdf_document: - toc: yes -date: "`r Sys.Date()`" -params: - meta: NULL - input_dir: NULL - artifact_dir: NULL - cpus: 1 - study_type: NULL - study_name: NULL - study_abundance_type: NULL - report_file: NULL, - report_title: NULL, - report_author: NULL, - report_description: NULL, - observations_type: NULL - observations: NULL # GSE156533.samplesheet.csv - observations_id_col: NULL - observations_name_col: NULL - check_log: NULL - features: NULL - features_type: NULL - features_id_col: NULL - features_name_col: NULL - features_metadata_cols: NULL - raw_matrix: null # e.g. 0_salmon.merged.gene_counts.tsv - normalised_matrix: null - variance_stabilised_matrix: null # e.g. test_files/3_treatment-WT-P23H.vst.tsv - contrasts_file: null # e.g. GSE156533.contrasts.csv - differential_table: file.csv - affy_cel_files_archive: NULL - affy_file_name_col: NULL - affy_background: NULL - affy_bgversion: NULL - affy_destructive: NULL - affy_cdfname: NULL - affy_rm_mask: NULL - affy_rm_outliers: NULL - affy_rm_extra: NULL - affy_build_annotation: NULL - limma_ndups: NULL - limma_spacing: NULL - limma_block: NULL - limma_correlation: NULL - limma_method: NULL - limma_proportion: NULL - limma_stdev_coef_lim: NULL - limma_trend: NULL - limma_robust: NULL - limma_winsor_tail_p: NULL - limma_adjust_method: NULL - limma_p_value: NULL - limma_lfc: NULL - limma_confint: NULL - exploratory_n_features: null - exploratory_clustering_method: null - exploratory_cor_method: null - exploratory_whisker_distance: null - exploratory_mad_threshold: null - exploratory_main_variable: null - exploratory_assay_names: NULL - exploratory_final_assay: NULL - exploratory_palette_name: NULL - versions_file: null # e.g 17_software_versions.yml - logo: null - css: null - citations: null - filtering_min_samples: 1 - filtering_min_abundance: 1 - filtering_min_proportion: NULL - filtering_grouping_var: NULL - differential_file_suffix: NULL - differential_feature_id_column: NULL - differential_feature_name_column: NULL - differential_fc_column: NULL - differential_pval_column: NULL - differential_qval_column: NULL - differential_min_fold_change: NULL - differential_foldchanges_logged: NULL - differential_max_pval: NULL - differential_max_qval: NULL - differential_palette_name: NULL - differential_subset_to_contrast_samples: NULL - deseq2_test: NULL - deseq2_fit_type: NULL - deseq2_sf_type: NULL - deseq2_min_replicates_for_replace: NULL - deseq2_use_t: NULL - deseq2_lfc_threshold: NULL - deseq2_alt_hypothesis: NULL - deseq2_independent_filtering: NULL - deseq2_p_adjust_method: NULL - deseq2_alpha: NULL - deseq2_minmu: NULL - deseq2_vs_method: NULL - deseq2_shrink_lfc: NULL - deseq2_cores: NULL - 
deseq2_vs_blind: NULL - deseq2_vst_nsub: NULL - gsea_run: false - gsea_nperm: NULL - gsea_permute: NULL - gsea_scoring_scheme: NULL - gsea_metric: NULL - gsea_sort: NULL - gsea_order: NULL - gsea_set_max: NULL - gsea_set_min: NULL - gsea_norm: NULL - gsea_rnd_type: NULL - gsea_make_sets: NULL - gsea_median: NULL - gsea_num: NULL - gsea_plot_top_x: NULL - gsea_rnd_seed: NULL - gsea_save_rnd_lists: NULL - gsea_zip_report: NULL - gsea_chip_file: NULL - gsea_gene_sets: NULL ---- - - - -```{r, include=FALSE} -library(knitr) -library(yaml) -library(shinyngs) -library(plotly) -library(ggplot2) -library(DT) -library(dplyr) - -# TODO -#mulled-build --use-mamba build 'r-base=4.2.3,r-rmarkdown=2.21,r-yaml=2.3.7,bioconductor-enhancedvolcano=1.16.0,anaconda::gmp=6.2.1,conda-forge::r-ggplot2=3.4.2,conda-forge::r-upsetr' -#mulled-build --use-mamba build 'r-base,r-rmarkdown,r-yaml,bioconductor-enhancedvolcano,anaconda::gmp,conda-forge::r-ggplot2,conda-forge::r-upsetr' -#conda-forge::r-base conda-forge::r-rmarkdown conda-forge::r-yaml bioconda::bioconductor-enhancedvolcano anaconda::gmp conda-forge::r-ggplot2 #conda-forge::r-upsetr -#/home-link/iivow01/tools/mambaforge/bin/python3.10 -#/home-link/iivow01/tools/mambaforge/envs/mulled/lib/python3.11/site-packages/galaxy/tool_util/deps/mulled -#galaxy.tool_util.deps.mulled.mulled_build -``` - -```{r include = FALSE} -# Load the datatables js -datatable(NULL) -``` - -```{r, include=FALSE} -versions <- unlist(yaml.load_file(file.path(params$input_dir, params$versions_file)), recursive = FALSE) -params_table <- data.frame(Parameter = names(unlist(params)), Value = unlist(params), row.names = NULL) - -# We'll subset the params table for different report sections -make_params_table <- function(name, pattern = NULL, remove_pattern = FALSE){ - subparams <- params_table - if (! is.null(pattern)){ - subparams <- subparams[grep(pattern, subparams$Parameter),] - } - if (remove_pattern){ - subparams$Parameter <- sub(pattern, '', subparams$Parameter) - } - - if (nrow(subparams) > 10){ - dom <- 'tp' - }else{ - dom <- 't' - } - - print( htmltools::tagList(datatable(subparams, caption = paste("Parameters used for", name), rownames = FALSE, options = list(dom = dom)) )) -} - -report_title <- paste0('Differential ', params$features_type, ' abundance report', ifelse(is.null(params$report_title), '', paste0(': ', params$report_title))) -report_subtitle <- paste0(ifelse(is.null(params$report_author), '', paste0('By ', params$report_author, ', ')), 'differentialabundance workflow version', versions[["Workflow.nf-core/differentialabundance"]]) -``` - ---- -title: "`r report_title`" -subtitle: `r report_subtitle` ---- - - - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - - - -```{r, echo=FALSE} -htmltools::includeCSS(params$css) -``` - -```{r results="asis", echo=FALSE} -cat(paste0(" - -")) -``` - - - -```{r, echo=FALSE} -observations <- read_metadata(file.path(params$input_dir, params$observations), id_col = params$observations_id_col) -if (! params$observations_name_col %in% colnames(observations)){ - stop(paste('Invalid observation name column specified: ', params$observations_name_col, paste0('(Valid values are: ', paste(colnames(observations), collapse=', '),')'))) -} - -if (! 
is.null(params$features)){ - features <- read_metadata(file.path(params$input_dir, params$features)) - features <- features[,colnames(features) %in% simpleSplit(params$features_metadata_cols), drop = FALSE] -} - -contrasts <- read_metadata(file.path(params$input_dir, params$contrasts_file)) -contrasts$blocking <- na.replace(contrasts$blocking, '') -if (! 'id' %in% colnames(contrasts)){ - contrasts$id <- apply(contrasts, 1, paste, collapse='_') -} - -# Identify informative variables- those with a number of values greater than 1 -# but less than N, with N being the number of observations. Make sure contrast -# variables are first in the list - -informative_variables <- unique(c(contrasts$variable, chooseGroupingVariables(observations))) - -# Remove any informative variables that group observations the same way -informative_variables <- informative_variables[ ! duplicated(lapply(structure(informative_variables, names= informative_variables), function(x) as.numeric(factor(observations[[x]], levels=unique(observations[[x]])))))] - -assay_names <- simpleSplit(params$exploratory_assay_names) -names(assay_names) = assay_names -assay_files <- lapply(assay_names, function(x) params[[paste0(x, '_matrix')]]) -capture.output(assay_files, file="/home-link/iivow01/git/differentialabundance/error/assay_files") - -assay_data <- lapply(assay_files, function(x) { - mat <- read_matrix( - x, - sample_metadata = observations, - row.names = 1 - ) - colnames(mat) <- observations[[params$observations_name_col]][match(colnames(mat), rownames(observations))] - - # Bit hacky, but ensure log - if (params$check_log && max(mat, na.rm=T) > 20){ - log2(mat+1) - }else{ - mat - } -}) - -# Now we can rename the observations rows using the title field -rownames(observations) <- observations[[params$observations_name_col]] - -# Run PCA early so we can understand how important each variable is - -pca_datas <- lapply(names(assay_data), function(assay_type){ - compilePCAData(assay_data[[assay_type]]) -}) -names(pca_datas) <- names(assay_data) - -pca_vs_meta <- anova_pca_metadata(pca_datas[[params$exploratory_final_assay]]$coords, observations[,informative_variables, drop = FALSE], pca_datas[[params$exploratory_final_assay]]$percentVar) - -# Show the variable with the tightest PC associations first -informative_variables <- rownames(pca_vs_meta)[order(pca_vs_meta[,1])] - -# Pick the variable used for coloring purposes etc -if (params$exploratory_main_variable == 'contrasts'){ - main_grouping_variable <- contrasts$variable[1] -}else if (params$exploratory_main_variable == 'auto_pca'){ - main_grouping_variable <- informative_variables[1] -}else{ - if (! 
params$exploratory_main_variable %in% colnames(observations)){ - stop(paste('Invalid main variable specified: ', params$exploratory_main_variable)) - } - main_grouping_variable <- params$exploratory_main_variable -} - -# Make sure the main variable is shown first, with remaining shown in order of -# informativeness - -informative_variables <- unique(c(main_grouping_variable, informative_variables)) - -groupColorScale <- makeColorScale(length(unique(observations[[main_grouping_variable]])), palette = params$exploratory_palette_name) -``` - - - -```{r, echo=FALSE} - -prefix_part_names <- c('variable', 'reference', 'target', 'blocking') -diff_prefixes <- sub('-$', '', apply(contrasts[,prefix_part_names], 1, function(x) paste(x, collapse = '-'))) - -differential_files <- lapply(diff_prefixes, function(d){ - file.path(params$input_dir, paste0(gsub(' |;', '_', d), params$differential_file_suffix)) -}) - -differential_results <- lapply(differential_files, function(diff_file){ - if (! file.exists(diff_file)){ - stop(paste("Differential file", diff_file, "does not exist")) - } - diff <- read_differential( - diff_file, - feature_id_column = params$differential_feature_id_column, - fc_column = params$differential_fc_column, - pval_column = params$differential_pval_column, - qval_column = params$differential_qval_column - ) - - # If fold changes are not logged already, log them (we assume they're logged - # later on) - - if (! params$differential_foldchanges_logged){ - diff[[params$differential_fc_column]] <- log2(diff[[params$differential_fc_column]]) - } - - # Annotate differential tables if possible - - if (! is.null(params$features)){ - diff <- merge(features, diff, by.x = params$features_id_col, by.y = params$differential_feature_id_column) - } - diff -}) -names(differential_results) <- diff_prefixes -``` - - - -```{r, echo=FALSE} - -contrast_descriptions <- paste(contrasts$target, 'versus', contrasts$reference, 'in', contrasts$variable) -with_blocking <- which(contrasts$blocking != '') -contrast_descriptions[with_blocking] <- paste0(contrast_descriptions[with_blocking], " (blocking on ", contrasts$blocking[with_blocking],")") - -# Check both adjusted and unadjusted p values - -p_value_types <- list(Adjusted = params$differential_qval_column, Unadjusted = params$differential_pval_column) -p_value_thresholds <- list(Adjusted = params$differential_max_qval, Unadjusted = params$differential_max_pval) - -sig_differential <- - lapply(names(p_value_types), function(pvt){ - diff <- lapply( - 1:nrow(contrasts), - function(x){ - signif <- differential_results[[x]][,p_value_types[[pvt]] ] < p_value_thresholds[[pvt]] - list( - up = differential_results[[x]][which( - differential_results[[x]][,params$differential_fc_column ] > log2(params$differential_min_fold_change) & - signif - ),], - down = differential_results[[x]][which( - differential_results[[x]][,params$differential_fc_column ] < log2(1/params$differential_min_fold_change) & - signif - ),] - ) - } - ) - names(diff) <- contrast_descriptions - diff - }) -names(sig_differential) <- names(p_value_types) - -# Count the differential genes -differential_tables <- lapply(names(sig_differential), function(sd) do.call(rbind, lapply(sig_differential[[sd]], function(x) lapply(x, function(y) nrow(y))))) -names(differential_tables) <- names(sig_differential) -``` - - - -# Abstract - -This report summarises differential `r params$features_type` analysis as performed by the nf-core/differentialabundance pipeline. 
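
The up/down calls tallied a few lines above come down to two cuts applied to each differential table: an absolute log2 fold-change cut and a p-value cut. The sketch below reproduces that selection logic on a toy table; the column names `log2FoldChange` and `padj` and the 2-fold / 0.05 thresholds are illustrative assumptions, not the pipeline's configured values.

```r
# Toy differential table; column names and thresholds are illustrative only
diff_tab <- data.frame(
    feature_id     = paste0("g", 1:6),
    log2FoldChange = c(2.1, -1.8, 0.2, 1.2, -3.0, 0.0),
    padj           = c(0.001, 0.04, 0.9, 0.2, 0.01, 0.5)
)

min_fold_change <- 2     # assumed fold-change threshold (linear scale)
max_qval        <- 0.05  # assumed adjusted p-value threshold

signif <- diff_tab$padj < max_qval
up     <- diff_tab[which(diff_tab$log2FoldChange > log2(min_fold_change) & signif), ]
down   <- diff_tab[which(diff_tab$log2FoldChange < log2(1 / min_fold_change) & signif), ]

# Counts comparable to one row of the per-contrast summary tables
c(up = nrow(up), down = nrow(down))
```
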
- -# Data - -```{r, echo=FALSE, results='asis'} -cat(paste0("\n## ", ucfirst(params$observations_type), "s\n")) -``` - - -A summary of `r params$observations_type` metadata is below: - -```{r, echo=FALSE, results='asis'} -display_columns <- union(c(params$observations_id_col, unique(contrasts$variable)), informative_variables) -minimal_fetchngs_cols <- c('sample', 'sample_title', 'strandedness', 'library_strategy', 'scientific_name') - -# If the data came via fetchngs then we can infer a couple of things about the most useful columns - -if (all(minimal_fetchngs_cols %in% colnames(observations))){ - additional_useful_cols <- minimal_fetchngs_cols -}else{ - additional_useful_cols <- colnames(observations)[which(apply(observations, 2, function(x) max(nchar(x))) <= 20)] -} - -display_columns <- head(union(display_columns, additional_useful_cols), 5) - -# Also add informative columns -display_columns <- unique(c(display_columns, informative_variables)) -observations_to_print <- observations[,unique(display_columns)] -colnames(observations_to_print) <- prettifyVariablename(colnames(observations_to_print)) -print( htmltools::tagList(datatable(observations_to_print, caption = paste(ucfirst(params$observations_type), 'metadata'), rownames = FALSE, options = list(dom = 't')) )) - -``` - -## Contrasts - -Comparisons were made between `r params$observations_type` groups defined using using `r params$observation_type` metadata columns, as described in the following table of contrasts: - -```{r, echo=FALSE, results='asis'} -contrasts_to_print <- contrasts -colnames(contrasts_to_print) <- prettifyVariablename(colnames(contrasts_to_print)) -print( htmltools::tagList(datatable(contrasts_to_print, caption = paste0("Table of contrasts"), rownames = FALSE, options = list(dom = 't')) )) -``` - -# Results - -## Counts - -Input was a matrix of `r nrow(assay_data$raw)` `r params$features_type`s for `r ncol(assay_data$raw)` `r params$observations_type`s`r ifelse(nrow(assay_data$normalised) < nrow(assay_data$raw), paste0(', reduced to ', nrow(assay_data$normalised), ' ', params$features_type, 's after filtering for low abundance'), '')`. - -## Exploratory analysis - -### Abundance value distributions - -The following plots show the abundance value distributions of input matrices. A log2 transformation is applied where not already performed. - -```{r, include=FALSE} - -``` -#### Box plots {.tabset} - -```{r, echo=FALSE, results='asis', fig.height=8} -for (a in names(assay_data)) { - cat(paste0("\n##### ", prettifyVariablename(a), "\n")) - p <- ggplot_boxplot( - assay_data[[a]], - experiment = observations, - colorby = main_grouping_variable, - expressiontype = paste("count per", params$features_type), - palette = groupColorScale, - whisker_distance = params$exploratory_whisker_distance, - base_size=8 - ) - print(p) - cat("\n") -} -``` - -Whiskers in the above boxplots show `r params$exploratory_whisker_distance` times the inter-quartile range. 
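
For reference, a whisker distance of k means each whisker stops at the most extreme observation lying within k times the inter-quartile range of the nearer quartile. A minimal base-R sketch of that rule, using random data and k = 1.5 purely as example values:

```r
set.seed(1)
x <- rnorm(100)   # toy abundance values for one sample
k <- 1.5          # example whisker distance (multiple of the IQR)

q   <- quantile(x, c(0.25, 0.75))
iqr <- diff(q)

# Most extreme observations still within k * IQR of the quartiles
lower_whisker <- min(x[x >= q[1] - k * iqr])
upper_whisker <- max(x[x <= q[2] + k * iqr])
c(lower_whisker, upper_whisker)

# The same coefficient is exposed as 'coef' by boxplot.stats() and ggplot2::geom_boxplot()
boxplot.stats(x, coef = k)$stats
```
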
- -#### Density plots - -```{r, echo=FALSE, results='asis', fig.height=8} -plotly_densityplot( - assay_data, - experiment = observations, - colorby = params$observations_name_col, - expressiontype = paste("count per", params$features_type), - makeColorScale(length(unique(observations[[params$observations_id_col]])), palette = "Set1") -) -``` - -```{r, echo=FALSE, results='asis'} -cat(paste0("\n### ", ucfirst(params$observations_type), " relationships\n")) -``` - -#### Principal components plots - -Principal components analysis was conducted based on the `r params$exploratory_n_features` most variable `r params$features_type`s. Each component was annotated with its percent contribution to variance. - -```{r, echo=FALSE, results='asis'} -for (assay_type in rev(names(assay_data))){ - - pca_data <- pca_datas[[assay_type]] - - for (iv in informative_variables){ - - cat(paste0("\n##### ", prettifyVariablename(assay_type), " (", iv, ")\n")) - - plotdata <- pca_data$coords - plotdata$colorby <- factor( - observations[[iv]], - levels = unique(observations[[iv]]) - ) - pcaColorScale <- makeColorScale(length(unique(observations[[iv]])), palette = params$exploratory_palette_name) - - # Make plotting data combining PCA coords with coloring groups etc - - plotdata$name <- rownames(plotdata) - percentVar <- pca_data$percentVar - labels <- paste0(colnames(plotdata), " (", sprintf("%.1f", percentVar), "%)") - ncats <- length(unique(plotdata$colorby)) - - plot_types <- list("2" = "scatter", "3" = "scatter3d") - - for (d in names(plot_types)) { - - # Default plot args whatever we're doing - - plot_args <- list( - x = pca_data$coords[, 1], - y = pca_data$coords[, 2], - xlab = labels[1], - ylab = labels[2], - colorby = plotdata$colorby, - plot_type = plot_types[[d]], - palette = pcaColorScale, - legend_title = prettifyVariablename(iv), - labels = plotdata$name, - show_labels = TRUE - ) - if (d == "3") { - plot_args$z <- pca_data$coords[, 3] - plot_args$zlab <- labels[3] - } - - print(htmltools::tagList(do.call("plotly_scatterplot", plot_args))) - } - } -} -``` - - -#### Scree plots {.tabset} - -The scree plot below shows the proportion of variance that is explained by each of the PCA components. 
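
As a reminder of what feeds a scree plot: the proportion of variance for each component is that component's variance divided by the total variance, which `prcomp()` and `summary()` report directly. A minimal sketch on a random matrix (toy data standing in for a features-by-samples assay, transposed so samples are rows):

```r
set.seed(42)
mat <- matrix(rnorm(50 * 8), nrow = 50)      # toy matrix: 50 features x 8 samples

pca <- prcomp(t(mat), scale. = TRUE)         # samples as rows

# Proportion of variance per component, computed two equivalent ways
pov_manual  <- pca$sdev^2 / sum(pca$sdev^2)
pov_summary <- summary(pca)$importance["Proportion of Variance", ]

round(rbind(pov_manual, pov_summary), 3)
```
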
- -```{r, echo=FALSE, results='asis', message=F} -for (assay_type in rev(names(assay_data))){ - imp <- summary(pca_datas[[assay_type]]) - cat(paste0("\n##### ", prettifyVariablename(assay_type), "\n")) - - pca <- prcomp(t(na.omit((assay_data[[assay_type]]))), scale = TRUE) - imp <- t(as.data.frame(summary(pca)$importance)[2,]) - imp <- as.data.frame(cbind(PCA=rownames(imp), imp)) - colnames(imp) <- c("PCA", "POV") - - write.table(imp, file=paste0("/home-link/iivow01/git/differentialabundance/error/", assay_type, "_", "imp.tsv"), quote=F, sep="\t") - write.table(imp$PCA, file=paste0("/home-link/iivow01/git/differentialabundance/error/", assay_type, "_", "imp_data1.tsv"), quote=F, sep="\t") - write.table(imp$POV, file=paste0("/home-link/iivow01/git/differentialabundance/error/", assay_type, "_", "imp_data2.tsv"), quote=F, sep="\t") - - p <- ggplot(data=imp, aes(x=factor(PCA, level=imp$PCA), y=POV)) + geom_bar(stat="identity") + - xlab('') + - ylab('Proportion of Variance') + - theme_bw() + - theme(legend.title = element_blank()) + - theme(text = element_text(size=12)) - print(p) - cat("\n") -} -``` - -#### Principal components/ metadata associations - -For the variance stabilised matrix, an ANOVA test was used to determine assocations between continuous principal components and categorical covariates (including the variable of interest). - -The resulting p values are illustrated below. - -```{r, echo=FALSE, results='asis'} - -# This is a little hack to work around a bug in d3heatmap with single-row data -# frames. -if (nrow(pca_vs_meta) == 1){ - plot_pca_meta <- rbind(pca_vs_meta, pca_vs_meta) -}else{ - plot_pca_meta <- pca_vs_meta -} - - -write.table(-log10(plot_pca_meta), file=paste0("/home-link/iivow01/git/differentialabundance/error/plot_pca_meta_log.tsv"), quote=F, sep="\t") -write.table((plot_pca_meta), file=paste0("/home-link/iivow01/git/differentialabundance/error/plot_pca_meta.tsv"), quote=F, sep="\t") - -d3heatmap::d3heatmap( - -log10(plot_pca_meta), - Rowv = FALSE, - dendrogram = 'none', - cellnote = plot_pca_meta, - cexCol = 0.8, - cexRow = 0.8, - height = (100 + (15 * nrow(plot_pca_meta))), - colors = colorRampPalette( - rev( - RColorBrewer::brewer.pal(n = 7, name = "RdYlBu") - ) - )(100) -) - -for (variable in rownames(pca_vs_meta)){ - sig_comps <- pca_vs_meta[variable,] < 0.1 - - if (any(sig_comps)){ - min_sig_comp <- min(which(sig_comps)) - - min_sig_comp_p <- sprintf("%.2f", pca_vs_meta[variable, min_sig_comp]) - cat(paste0('The variable \'', variable, '\' shows an association with ', colnames(pca_vs_meta)[min_sig_comp], ' (p = ', min_sig_comp_p,'). ')) - } -} -``` - -#### Clustering dendrograms {.tabset} - -A hierarchical clustering of `r params$features_type`s was undertaken based on the top `r params$exploratory_n_features` most variable `r params$features_type`s. Distances between `r params$features_type`s were estimated based on `r params$exploratory_cor_method` correlation, which were then used to produce a clustering via the `r params$exploratory_clustering_method` method with `hclust()` in R. 
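
The correlation-then-`hclust()` recipe described above can be reproduced in a few lines of base R. In the sketch below, Spearman correlation and Ward linkage are example choices only (the report takes both the correlation and clustering methods from parameters), and the data are random:

```r
set.seed(7)
mat <- matrix(rnorm(200 * 6), nrow = 200,
              dimnames = list(NULL, paste0("sample", 1:6)))  # toy matrix: features x samples

cors <- cor(mat, method = "spearman")   # example correlation method; compares columns (samples)
d    <- as.dist(1 - cors)               # turn correlation (similarity) into a distance
hc   <- hclust(d, method = "ward.D2")   # example linkage method

plot(hc, main = "Clustering dendrogram on toy data")
```
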
- -```{r, echo=FALSE, results='asis'} -for (assay_type in rev(names(assay_data))){ - for (iv in informative_variables){ - cat(paste0("\n##### ", prettifyVariablename(assay_type), " (", iv, ")\n")) - capture.output(assay_data[[assay_type]], file=paste0("/home-link/iivow01/git/differentialabundance/error/selectstuff_", assay_type, "_", iv)) - capture.output(assay_type, file=paste0("/home-link/iivow01/git/differentialabundance/error/type")) - variable_genes <- selectVariableGenes(matrix = assay_data[[assay_type]], ntop = params$exploratory_n_features) - capture.output(assay_data[[assay_type]][variable_genes, ], file=paste0("/home-link/iivow01/git/differentialabundance/error/dendrostuff_", assay_type, "_", iv)) - - dendroColorScale <- makeColorScale(length(unique(observations[[iv]])), palette = params$exploratory_palette_name) - capture.output("assay_data[[assay_type]][variable_genes, ]", file=paste0("/home-link/iivow01/git/differentialabundance/error/dendrostuff_")) - p <- clusteringDendrogram( - 2^assay_data[[assay_type]][variable_genes, ], - observations[, iv, drop = FALSE], - colorby = iv, - cor_method = params$exploratory_cor_method, - plot_title = paste0( - paste0(params$observations_type," clustering dendrogram, "), - params$exploratory_n_features, - " most variable ", - params$features_type, - "s\n(", params$exploratory_clustering_method, " clustering, ", params$exploratory_cor_method, " correlation)"), - cluster_method = params$exploratory_clustering_method, - palette = dendroColorScale, - labelspace = 0.25 - ) - # Defaults in shinyngs make the text in this plot a bit big for the report, so - # scale it down a bit - print(p, vp=grid::viewport(gp=grid::gpar(cex=0.7))) - cat("\n") - } -} -``` - -### Outlier detection {.tabset} - -Outlier detection based on [median absolute deviation](https://wiki.arrayserver.com/wiki/index.php?title=CorrelationQC.pdf) was undertaken, the outlier scoring is plotted below. - -```{r, echo=FALSE, results='asis', warning=FALSE} - -# We can't look for ouliers in sets of less than 3 samples, so exclude variables -# unless the minimum group size is larger than that -iv_min_group_sizes <- unlist(lapply(informative_variables, function(x) min(table(observations[[x]])))) - -foo <- lapply(informative_variables[iv_min_group_sizes > 2], function(iv){ - - cat(paste("\n####", iv, "\n")) - - plotdata <- - madScore( - matrix = assay_data[[params$exploratory_final_assay]], - sample_sheet = observations, - groupby = iv - ) - - if (! 
is.null(plotdata)){ - mad_plot_args <- list( - x = plotdata$group, - y = plotdata$mad, - color = plotdata$outlier, - hline_thresholds = c("Outlier threshold" = params$exploratory_mad_threshold), - palette = makeColorScale(2, palette = params$differential_palette_name), - legend_title = "Outlier status", - labels = rownames(plotdata), - show_labels = TRUE, - xlab = "Sample group", - ylab = "MAD score" - ) - - print(htmltools::tagList(do.call("plotly_scatterplot", mad_plot_args))) - - outliers <- rownames(plotdata)[plotdata$outlier] - - if (length(outliers) == 0){ - cat(paste0("No outlying samples were detected in groups defined by ", iv,".\n")) - }else{ - cat(paste0(length(outliers), ' possible outliers were detected in groups defined by ', iv ,': ', paste(outliers, collapse=', '), "\n")) - } - } -}) - -``` - -## Differential analysis - -### Differential `r params$features_type` `r params$study_abundance_type` {.tabset} - -```{r, echo=FALSE, results='asis'} -foo <- lapply(names(p_value_types), function(pvt){ - cat("\n#### ", pvt, "\n") - print( htmltools::tagList(datatable(differential_tables[[pvt]], caption = paste0('Differential ', params$features_type, " ", params$abundance_type, ' (target relative to reference)'), options = list(dom = 't'), rownames = TRUE) )) - cat("\n") -}) -``` - -```{r, echo=FALSE, results='asis', eval = FALSE} - -differential_summary_string <- paste( - paste( - lapply( - 1:nrow(contrasts), - function(x){ - paste0( - "Contrast ", x, ' (', contrast_descriptions[x], ') ', "had ", differential_table[x,'up'], ' ', paste0(params$features_type, 's'), ' expressed significantly more highly in ', contrasts[x, 'target',], ' than ', contrasts[x, 'reference',], ' and ', differential_table[x,'down'], ' expressed at sifnificantly lower levels.' - ) - } - ), - collapse = ' ' - ) -) -cat(differential_summary_string) -``` - -### Volcano plots - -```{r, echo=FALSE, results='asis'} - -# Set up palette of 4 colors -volcano_palette <- colorRampPalette(colors = c("gray", "green", "blue", "red"))(4) - -# Two functions to add vertical/horizontal lines to the volcano plot -vline <- function(x = 0, color = "black") { - list( - type = "line", - y0 = 0, - y1 = 1, - yref = "paper", - x0 = x, - x1 = x, - line = list(color = color, dash="dot") - ) -} -hline <- function(y = 0, color = "black") { - list( - type = "line", - x0 = 0, - x1 = 1, - xref = "paper", - y0 = y, - y1 = y, - line = list(color = color, dash="dot") - ) -} - -for (i in 1:nrow(contrasts)){ - cat("\n#### ", contrast_descriptions[i], "\n") - - ## Make a volcano plot for the contrast first - - # Label features with symbol as well as identifier - if (! is.null(params$features) && (! is.null(params$differential_feature_name_column)) ){ - label_col <- params$differential_feature_name_column - }else{ - label_col <- params$differential_feature_id_column - } - - # Get the full set of differential stats for this contrast, removing rows with - # NAs in the fields we need. - full_de <- differential_results[[i]] - full_de <- subset(full_de, (! is.na(full_de[[params$differential_fc_column]])) & (! 
is.na(full_de[[params$differential_qval_column]])) )

    full_de$color <- 1    # default (black)
    full_de$color[abs(full_de[[params$differential_fc_column]]) >= log2(params$differential_min_fold_change)] <- 2    # high FC (green)
    full_de$color[full_de[[params$differential_qval_column]] <= params$differential_max_qval] <- 3    # low adjusted p value (blue)
    full_de$color[abs(full_de[[params$differential_fc_column]]) >= log2(params$differential_min_fold_change) & full_de[[params$differential_qval_column]] <= params$differential_max_qval] <- 4    # high FC & low adjusted p value (red)

    # We'll color by whether features are differential according to supplied thresholds

    p_value_types <- list(Adjusted = params$differential_qval_column, Unadjusted = params$differential_pval_column)
    p_value_thresholds <- list(Adjusted = params$differential_max_qval, Unadjusted = params$differential_max_pval)


for (pvt in names(p_value_types)){
    cat("\n##### ", pvt, " p values\n")
    pval_column <- p_value_types[[pvt]]

    full_de$differential_status <- FALSE
    full_de$differential_status[abs(full_de[[params$differential_fc_column]]) > log2(params$differential_min_fold_change) & full_de[[pval_column]] < p_value_thresholds[[pvt]]] <- TRUE

    # Define the thresholds we'll draw

    hline_thresholds = vline_thresholds = list()
    hline_thresholds[[paste(pval_column, '=', p_value_thresholds[[pvt]])]] = -log10(p_value_thresholds[[pvt]])
    vline_thresholds[[paste(params$differential_fc_column, '<-', log2(params$differential_min_fold_change))]] = -log2(params$differential_min_fold_change)
    vline_thresholds[[paste(params$differential_fc_column, '>', log2(params$differential_min_fold_change))]] = log2(params$differential_min_fold_change)

    plot_args <- list(type = "scatter", mode = 'markers')

    # Let's equalize the axes
    max_fc <- max(abs(full_de[[params$differential_fc_column]])) * 1.1
    p <- do.call(plot_ly, plot_args) %>%
        layout(xaxis = list(range=list(-max_fc, max_fc),
                title = paste("higher in", contrasts$reference[i], "     <<", params$differential_fc_column, ">>     higher in", contrasts$target[i]))
            #    shapes = list(
            #     hline(-log10(p_value_thresholds[[pvt]])),
            #     hline(log10(p_value_thresholds[[pvt]])),
            #     vline(log2(params$differential_min_fold_change))
            # )
            ) %>%
        add_trace(mode = "markers", name = "Not significant", x = full_de[[params$differential_fc_column]][full_de$color==1], y = 
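        # Axis note (illustration only, with example threshold values not taken from any
        # particular run): the volcano plot puts log2 fold change on x and -log10(p) on y,
        # so the thresholds above translate directly into straight lines. For instance,
        # with differential_min_fold_change = 2 and a p-value cutoff of 0.01:
        #
        #   log2(2)       # = 1  -> vertical lines at x = -1 and x = 1
        #   -log10(0.01)  # = 2  -> horizontal line at y = 2
        #
        # The y values passed to each trace below are computed with the same -log10().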
-log10(full_de[[pval_column]][full_de$color==1]), marker = list(color = "black")) %>% - add_trace(mode = "markers", name = paste0("abs(", params$differential_fc_column, ")>", params$differential_min_fold_change), x = full_de[[params$differential_fc_column]][full_de$color==2], y = -log10(full_de[[pval_column]][full_de$color==2]), marker = list(color = "green")) %>% - add_trace(mode = "markers", name = paste0(pval_column, "<=", params$differential_max_qval), x = full_de[[params$differential_fc_column]][full_de$color==3], y = -log10(full_de[[pval_column]][full_de$color==3]), marker = list(color = "blue")) %>% - add_trace(mode = "markers", name = paste0("abs(", params$differential_fc_column, ")>", params$differential_min_fold_change, "\n& ", pval_column, "<=", params$differential_max_qval), x = full_de[[params$differential_fc_column]][full_de$color==4], y = -log10(full_de[[pval_column]][full_de$color==4]), marker = list(color = "red")) - -# differential_status[abs(full_de[[params$differential_fc_column]]) > log2(params$differential_min_fold_change) & full_de[[pval_column]] < p_value_thresholds[[pvt]]] - - - - print(htmltools::tagList(p)) - - - - - ## ... then show tables of the up/ down genes - - for (dir in c('up', 'down')){ - contrast_de <- sig_differential[[pvt]][[i]][[dir]] - cols_to_round <- c(params$differential_fc_column, params$differential_pval_column, params$differential_qval_column) - contrast_de[, cols_to_round] <- signif(contrast_de[, cols_to_round], 8) - - colnames(contrast_de) <- prettifyVariablename(colnames(contrast_de)) - - if (nrow(contrast_de) > 0){ - print( htmltools::tagList(datatable(contrast_de, caption = paste('Differential genes', dir, 'in', contrast_descriptions[i], " (check", differential_files[[i]], "for more detail)"), rownames = FALSE) )) - }else{ - cat(paste0("No significantly differential '", dir, "' genes.\n\n")) - } - } - } - -} -``` - - - -```{r, echo=FALSE, results='asis'} -possible_gene_set_methods <- c('gsea') -if (any(unlist(params[paste0(possible_gene_set_methods, '_run')]))){ - cat("\n### Gene set analysis\n") - - for (gene_set_method in possible_gene_set_methods){ - if (unlist(params[paste0(gene_set_method, '_run')])){ - cat("\n### ", toupper(gene_set_method) ," {.tabset}\n") - - for (gmt_file in simpleSplit(params$gsea_gene_sets)) { - gmt_name <- basename(tools::file_path_sans_ext(gmt_file)) - - cat("\n#### ", gmt_name ," {.tabset}\n") - reference_gsea_tables <- paste0(contrasts$id, ".", gmt_name, '.gsea_report_for_', contrasts$reference, '.tsv') - target_gsea_tables <- paste0(contrasts$id, ".", gmt_name, '.gsea_report_for_', contrasts$target, '.tsv') - - for (i in 1:nrow(contrasts)){ - cat("\n##### ", contrast_descriptions[i], "\n") - - target_gsea_results <- read_metadata(target_gsea_tables[i])[,c(-2,-3)] - print( htmltools::tagList(datatable(target_gsea_results, caption = paste0("\nTarget (", contrasts$target[i], ")\n"), rownames = FALSE) )) - - ref_gsea_results <- read_metadata(reference_gsea_tables[i])[,c(-2,-3)] - print( htmltools::tagList(datatable(ref_gsea_results, caption = paste0("\nReference (", contrasts$reference[i], ")\n"), rownames = FALSE) )) - } - } - } - } -} -``` - -# Methods - -## Filtering - -```{r, echo=FALSE, results='asis'} -make_params_table('feature-wise filtering', 'filtering_', remove_pattern = TRUE) -``` - -```{r, echo=FALSE, results='asis'} -filtering_string <- paste0('Filtering was carried out by selecting ', params$features_type, 's with an abundance of at least ', params$filtering_min_abundance) - -if 
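# Worked example (hypothetical parameter values and object names, for illustration only):
# with filtering_min_abundance = 10, filtering_min_samples = 2 and no grouping variable or
# minimum proportion set, a feature is retained only if at least 2 observations show an
# abundance of 10 or more, i.e. something along the lines of:
#
#   keep <- rowSums(abundance_matrix >= 10) >= 2
#
# The branches below only turn the chosen parameters into the equivalent sentence for the
# report; the filtering itself happens upstream in the pipeline.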
(is.null(params$filtering_grouping_var)){ - if (is.null(params$filtering_min_proportion)){ - filtering_string <- paste0(filtering_string, ' in at least ', params$filtering_min_samples, ' ', params$observations_type, 's.') - }else{ - filtering_string <- paste0(filtering_string, ' in at least a proportion of ', params$filtering_min_proportion, ' of ', params$observations_type,'s.') - } -}else{ - if (is.null(params$filtering_min_proportion)){ - filtering_string <- paste0(filtering_string, ' in at least the number of ', params$observations_type, 's corresponding to the smallest group size defined by the grouping variable "', params$filtering_grouping_var, '".') - }else{ - filtering_string <- paste0(filtering_string, ' in at least a proportion of ', params$filtering_min_proportion, ' of the number of ', params$observations_type,'s corresponding to the smallest group size defined by the grouping variable"', params$filtering_grouping_var, '".') - } -} -cat(filtering_string) -``` - -## Exploratory analysis - -```{r, echo=FALSE, results='asis'} -make_params_table('exploratory analysis', 'exploratory_', remove_pattern = TRUE) -``` - -## Differential analysis - -```{r, echo=FALSE, results='asis'} -if (params$study_type == 'rnaseq'){ - make_params_table('DESeq2', 'deseq2_', remove_pattern = TRUE) -} -make_params_table('downstream differential analysis', 'differential_', remove_pattern = TRUE) -``` - - - -```{r, echo=FALSE, results='asis'} -possible_gene_set_methods <- c('gsea') - -if (any(unlist(params[paste0(possible_gene_set_methods, '_run')]))){ - cat("\n### Gene set analysis\n") - - for (gene_set_method in possible_gene_set_methods){ - if (unlist(params[paste0(gene_set_method, '_run')])){ - cat("\n### ", toupper(gene_set_method) ," {.tabset}\n") - make_params_table(toupper(gene_set_method), paste0(gene_set_method, '_'), remove_pattern = TRUE) - } - } - -} -``` - -# Appendices - -## All parameters - -```{r, echo=FALSE, results='asis'} -print( htmltools::tagList(datatable(params_table, caption = "All parameters", rownames = FALSE) )) -``` - -## Software versions - -**Note:** For a more detailed accounting of the software and commands used (including containers), consult the execution report produced as part of the 'pipeline info' for this workflow. 
- -```{r, echo=FALSE, results='asis'} -versions_table <- data.frame(do.call(rbind, strsplit(names(versions), split = '\\.')), unlist(versions)) -colnames(versions_table) <- c('Component', 'Software', 'Version') -print( htmltools::tagList(datatable(versions_table, caption = "Software versions", rownames = FALSE, options = list(dom = 'ft', paging = FALSE)) )) -``` - -```{r, echo=FALSE, results='asis'} -htmltools::includeMarkdown(params$citations) -``` diff --git a/conf/modules.config b/conf/modules.config index ad6ac534..692a2ccc 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -34,16 +34,7 @@ process { ] ext.args = "--feature-type transcript" } - - withName: PROTEUS { - publishDir = [ - [ - path: { "${params.outdir}/proteus" } - ] - ] - //ext.args = "--sample_id_col \"Sample Number\" --measure_col_prefix \"LFQ intensity \"" - ext.args = "--sample_id_col \"${params.observations_id_col}\" --measure_col_prefix \"${params.maxquant_measurecol_prefix}\" " - } + withName: VALIDATOR { publishDir = [ @@ -109,6 +100,36 @@ process { "--build_annotation ${params.affy_build_annotation}" ].join(' ').trim() } } + + withName: PROTEUS { + publishDir = [ + [ + path: { "${params.outdir}/tables/proteus" }, + mode: params.publish_dir_mode, + pattern: '*proteingroups_tab.tsv' + ], + [ + path: { "${params.outdir}/plots/proteus" }, + mode: params.publish_dir_mode, + pattern: '*.png' + ], + [ + path: { "${params.outdir}/other/proteus" }, + mode: params.publish_dir_mode, + pattern: '*.{rds,sessionInfo.log}' + ] + ] + ext.args = { [ + "--sample_id_col \"${params.observations_id_col}\"", + "--protein_id_col \"${params.features_id_col}\"", + "--measure_col_prefix \"${params.proteus_measurecol_prefix}\"", + "--normfuns $params.proteus_norm_functions", + "--plotSampleDistributions_method $params.proteus_plotSD_method", + "--plotMV_loess $params.proteus_plotMV_loess", + "--palette_name $params.proteus_palette_name", + "--round_digits $params.proteus_round_digits" + ].join(' ').trim() } + } withName: DESEQ2_DIFFERENTIAL { publishDir = [ diff --git a/modules.json b/modules.json index 1485fc65..8851fcd9 100644 --- a/modules.json +++ b/modules.json @@ -55,6 +55,11 @@ "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, + "proteus/readproteingroups": { + "branch": "master", + "git_sha": "007dd9c990670392d3fb6607529966a1a614e1e1", + "installed_by": ["modules"] + }, "rmarkdownnotebook": { "branch": "master", "git_sha": "603ecbd9f45300c9788f197d2a15a005685b4220", diff --git a/modules/nf-core/proteus/main.nf b/modules/nf-core/proteus/main.nf deleted file mode 100644 index 3a22e1bb..00000000 --- a/modules/nf-core/proteus/main.nf +++ /dev/null @@ -1,31 +0,0 @@ -process PROTEUS { - tag "$meta" - label 'process_medium' -//TODO: Change containers -// conda "proteus2" -// conda "bioconda::r-proteus-bartongroup=0.2.16 conda-forge::r-plotly=4.10.1 bioconda::bioconductor-limma=3.54.0" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/r-proteus-bartongroup:0.2.16--r42hdfd78af_0' : - 'quay.io/biocontainers/mulled-v2-0ad0abd3e3e02e24e1626edaef6d6f9a967733fb:246a3b59c610a9cd35bdf8110be3a908e3769ae0-0' }" - - input: - tuple val(meta), path(samplesheet), path(intensities) - - output: - tuple val(meta), path("*normalised_distributions.png") , emit: raw_dist_plot - tuple val(meta), path("*normalised_distributions.png") , emit: norm_dist_plot - tuple val(meta), path("*mean_variance_relationship.png") , emit: mean_var_relationship_plot - tuple val(meta), path("*dendrogram.png") , emit: dendro_plot - tuple val(meta), path("*raw_proteingroups.rds") , emit: rdata - tuple val(meta), path("*raw_proteingroups_tab.tsv") , emit: tab - tuple val(meta), path("*normalised_proteingroups_tab.tsv") , emit: normtab -// tuple val(meta), path("*normalised_proteingroups_tab2.tsv"), emit: normtab2 - tuple val(meta), path("*R_sessionInfo.log") , emit: session_info - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - template 'proteus_readproteingroups.R' -} diff --git a/modules/nf-core/proteus/meta.yml b/modules/nf-core/proteus/meta.yml deleted file mode 100644 index b657d809..00000000 --- a/modules/nf-core/proteus/meta.yml +++ /dev/null @@ -1,59 +0,0 @@ -name: "limma_differential" -description: runs a differential expression analysis with Limma -keywords: - - intensities - - proteomics - - limma - -tools: - - "limma": - description: "Linear Models for Microarray Data" - homepage: "https://bioconductor.org/packages/release/bioc/html/limma.html" - documentation: "https://bioconductor.org/packages/release/bioc/vignettes/limma/inst/doc/usersguide.pdf" - tool_dev_url: https://github.com/cran/limma"" - doi: "10.18129/B9.bioc.limma" - licence: "['LGPL >=3']" - -input: - - meta: - type: map - description: | - Groovy Map containing contrast information, which can be referred to in - calls at the pipeline level e.g. 
[ variable:'treatment', reference:'treated', - control:'saline', blocking:'' ] passed in as ext.args like: '--reference_level - $meta.reference --treatment_level $meta.target' - - samplesheeet: - type: file - description: | - CSV or TSV format sample sheet with sample metadata - - intensities: - type: file - description: | - Raw TSV or CSV format expression matrix with probes by row and samples - by column - -output: - - results: - type: file - description: TSV-format table of differential expression information as - output by Limma - pattern: "*.limma.results.tsv" - - md_plot: - type: file - description: Limma mean difference plot - pattern: "*.mean_difference.png" - - rdata: - type: file - description: Serialised MArrayLM object - pattern: "*.MArrayLM.limma.rds" - - session_info: - type: file - description: dump of R SessionInfo - pattern: "*.log" - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" - -authors: - - "@pinin4fjords" diff --git a/modules/nf-core/proteus/readproteingroups/main.nf b/modules/nf-core/proteus/readproteingroups/main.nf index 02c61e19..c7018a82 100644 --- a/modules/nf-core/proteus/readproteingroups/main.nf +++ b/modules/nf-core/proteus/readproteingroups/main.nf @@ -2,10 +2,10 @@ process PROTEUS_READPROTEINGROUPS { tag "$meta" label 'process_single' - conda "r-base=4.2.1 r-proteus-bartongroup=0.2.16 bioconductor-limma=3.54.0 r-plotly=4.10.2" + conda "conda-forge::r-base=4.2.1 bioconda::r-proteus-bartongroup=0.2.16 conda-forge::r-plotly=4.10.2 bioconda::bioconductor-limma=3.54.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/mulled-v2-4e01206f2c47f56077f04e5d2d7b312f50513a1e:92abccefbeb09795ad6a93553b62a6ad3daaea48-0': - 'quay.io/biocontainers/mulled-v2-4e01206f2c47f56077f04e5d2d7b312f50513a1e:92abccefbeb09795ad6a93553b62a6ad3daaea48-0' }" + 'biocontainers/mulled-v2-4e01206f2c47f56077f04e5d2d7b312f50513a1e:92abccefbeb09795ad6a93553b62a6ad3daaea48-0' }" input: tuple val(meta), path(samplesheet), path(intensities) @@ -15,12 +15,12 @@ process PROTEUS_READPROTEINGROUPS { output: tuple val(meta), path("*dendrogram.png") , emit: dendro_plot tuple val(meta), path("*mean_variance_relationship.png") , emit: mean_var_plot - tuple val(meta), path("*normalised_distributions.png") , emit: raw_dist_plot - tuple val(meta), path("*normalised_distributions.png") , emit: norm_dist_plot + tuple val(meta), path("*raw_distributions.png") , emit: raw_dist_plot + tuple val(meta), path("*normalized_distributions.png") , emit: norm_dist_plot tuple val(meta), path("*raw_proteingroups.rds") , emit: raw_rdata + tuple val(meta), path("*normalized_proteingroups.rds") , emit: norm_rdata tuple val(meta), path("*raw_proteingroups_tab.tsv") , emit: raw_tab - tuple val(meta), path("*normalised_proteingroups.rds") , emit: norm_rdata - tuple val(meta), path("*normalised_proteingroups_tab.tsv") , emit: norm_tab + tuple val(meta), path("*normalized_proteingroups_tab.tsv") , emit: norm_tab tuple val(meta), path("*R_sessionInfo.log") , emit: session_info path "versions.yml" , emit: versions diff --git a/modules/nf-core/proteus/readproteingroups/meta.yml b/modules/nf-core/proteus/readproteingroups/meta.yml index 9ec8ac8e..02031d9c 100644 --- a/modules/nf-core/proteus/readproteingroups/meta.yml +++ b/modules/nf-core/proteus/readproteingroups/meta.yml @@ -1,9 +1,9 @@ name: "proteus_readproteingroups" description: reads a maxQuant proteinGroups file with 
Proteus keywords: - - intensities - proteomics - proteus + - readproteingroups tools: - "proteus": description: "R package for analysing proteomics data" @@ -17,8 +17,7 @@ input: - meta: type: map description: | - Groovy Map containing contrast information, e.g. [ variable:'treatment', reference:'treated', - control:'saline', blocking:'' ] + Groovy Map containing contrast information, e.g. [ variable:'treatment', reference:'treated', control:'saline', blocking:'' ] - samplesheet: type: file description: | @@ -27,36 +26,45 @@ input: type: file description: | proteinGroups TXT file with protein intensities information from maxQuant; check here for specifications: https://rdrr.io/github/bartongroup/Proteus/man/readProteinGroups.html + - meta2: + type: map + description: | + Groovy Map containing contrast information, e.g. [ variable:'treatment', reference:'treated', control:'saline', blocking:'' ] + - contrast_variable: + type: string + description: | + The column in the sample sheet that should be used to define groups for + comparison output: - - raw_dist_plot: - type: file - description: | - PNG file; plot of the intensity/ratio distributions of the raw samples - - norm_dist_plot: + - dendro_plot: type: file description: | - PNG file; plot of the intensity/ratio distributions of the normalized samples + PNG file; dendrogram of the normalized samples hierarchically clustered by their intensities - mean_var_plot: type: file description: | PNG file; plot of the log-intensity variance vs log-intensity mean of each condition in the normalized samples - - dendro_plot: + - raw_dist_plot: type: file description: | - PNG file; dendrogram of the normalized samples hierarchically clustered by their intensities - - raw_rdata: + PNG file; plot of the intensity/ratio distributions of the raw samples + - norm_dist_plot: type: file description: | - RDS file of a proteinGroups object from Proteus, contains raw protein intensities and additional info - - raw_tab: + PNG file; plot of the intensity/ratio distributions of the normalized samples + - raw_rdata: type: file description: | - TSV-format intensities table from Proteus, contains raw protein intensities + RDS file of a proteinGroups object from Proteus, contains raw protein intensities and additional info - norm_rdata: type: file description: | RDS file of a proteinGroups object from Proteus, contains normalized protein intensities and additional info + - raw_tab: + type: file + description: | + TSV-format intensities table from Proteus, contains raw protein intensities - norm_tab: type: file description: | diff --git a/modules/nf-core/proteus/readproteingroups/templates/proteus_readproteingroups.R b/modules/nf-core/proteus/readproteingroups/templates/proteus_readproteingroups.R index 14eafdf2..ff75ea84 100644 --- a/modules/nf-core/proteus/readproteingroups/templates/proteus_readproteingroups.R +++ b/modules/nf-core/proteus/readproteingroups/templates/proteus_readproteingroups.R @@ -1,5 +1,31 @@ #!/usr/bin/env Rscript +# Written by Oskar Wacker (https://github.com/WackerO) in +# collaboration with Stefan Czemmel (https://github.com/qbicStefanC) +# Script template by Jonathan Manning (https://github.com/pinin4fjords) + +# MIT License + +# Copyright (c) QBiC + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, 
distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + ################################################ ################################################ ## Functions ## @@ -57,18 +83,21 @@ read_delim_flexible <- function(file, header = TRUE, row.names = NULL, check.nam #' #' @param dataframe A data frame #' @param columns Which columns to round (assumes all of them by default) -#' @param digits How many decimal places to round to? +#' @param digits How many decimal places to round to? If -1, will return the unchanged input df #' #' @return output Data frame -# TODO check if this is necessary -round_dataframe_columns <- function(df, columns = NULL, digits = 8) { +round_dataframe_columns <- function(df, columns = NULL, digits = -1) { + if (digits == -1) { + return(df) # if -1, return df without rounding + } + + df <- data.frame(df, check.names = FALSE) # make data.frame from vector as otherwise, the format will get messed up if (is.null(columns)) { columns <- colnames(df) } - - df[,columns] <- format( + df[,columns] <- round( data.frame(df[, columns], check.names = FALSE), - nsmall = digits + digits = digits ) # Convert columns back to numeric @@ -102,7 +131,8 @@ opt <- list( normfuns = 'normalizeMedian', plotSampleDistributions_method = 'violin', plotMV_loess = T, - palette_name = 'Set1' + palette_name = 'Set1', + round_digits = -1 ) opt_types <- lapply(opt, class) @@ -172,11 +202,11 @@ sample.sheet <- ) if (! opt\$protein_id_col %in% colnames(intensities.table)) { - stop(paste0("Specified protein ID column '", opt\$protein_id_col, "' is not in the intensities table; exiting...Valid columns are: ", paste(colnames(intensities.table), collapse=", "))) + stop(paste0("Specified protein ID column '", opt\$protein_id_col, "' is not in the intensities table")) } if (! 
opt\$sample_id_col %in% colnames(sample.sheet)) { - stop(paste0("Specified sample ID column '", opt\$sample_id_col, "' is not in the sample sheet; exiting...Valid columns are: ", paste(colnames(sample.sheet), collapse=", "))) + stop(paste0("Specified sample ID column '", opt\$sample_id_col, "' is not in the sample sheet")) } # Add metadata columns that are necessary for proteus @@ -185,6 +215,7 @@ sample.sheet\$sample <- sample.sheet[[opt\$sample_id_col]] sample.sheet\$condition <- sample.sheet[[opt\$contrast_variable]] # Add prefix for proteinGroups measurement columns to the sample IDs from the sampesheet + measure.cols <- setNames(paste0(opt\$measure_col_prefix, sample.sheet[[opt\$sample_id_col]]), sample.sheet[[opt\$sample_id_col]]) # Check that all samples specified in the input sheet are present in the intensities table @@ -201,25 +232,6 @@ if (length(missing_columns) > 0) { )) } -################################################ -################################################ -## CHECK AND FORMAT NORMFUN AND FILTERFUN ## -################################################ -################################################ - -valid_normfuns <- c("normalizeMedian", "normalizeQuantiles") -normfuns <- opt\$normfuns - -# Check validity of normfun(s) -invalid_normfuns <- normfuns[!(normfuns %in% valid_normfuns)] -if (length(invalid_normfuns)>0) { - stop(paste0("Invalid normfuns argument(s): ", - paste(invalid_normfuns, collapse=", "), - ". Valid normfuns are: ", - paste(valid_normfuns, collapse=", "), - "; exiting...")) -} - ################################################ ################################################ ## Run Proteus processes and generate outputs ## @@ -238,51 +250,60 @@ proteinGroups <- readProteinGroups( data.cols=proteinColumns ) +# Define valid normalization functions + +valid_normfuns <- list("normalizeMedian", "normalizeQuantiles") + # Generate plots for all requested normalizations; also, save normalized protein groups for limma -for (normfun in normfuns) { +for (normfun in unlist(strsplit(opt\$normfuns, ","))) { + if (! (normfun %in% valid_normfuns)) { + stop(paste0("Invalid normfuns argument: ", normfun, + ". Valid normfuns are: ", paste(valid_normfuns, collapse=", "), ".")) + } + proteinGroups.normalized <- normalizeData(proteinGroups, norm.fun = eval(parse(text=normfun))) # Proteus also accepts other norm.funs, e.g. 
from limma # Apply log2 and remove NAs as these will otherwise mess with some of the following modules proteinGroups.normalized\$tab <- na.omit(log2(proteinGroups.normalized\$tab)) - - png(paste(output_prefix, 'proteus', normfun, 'normalized_distributions.png', sep = '.'), width = 5*300, height = 5*300, res = 300, pointsize = 8) + + png(paste(output_prefix, 'proteus', normfun, 'normalized_distributions.png', sep='.'), width = 5*300, height = 5*300, res = 300, pointsize = 8) print( plotSampleDistributions(proteinGroups.normalized, title=paste0("Sample distributions after applying\n", normfun), fill="condition", method=opt\$plotSampleDistributions_method) - + scale_fill_brewer(palette=opt\$palette_name, name=opt\$contrast_variable) - + theme(plot.title = element_text(size = 12)) + + scale_fill_brewer(palette=opt\$palette_name, name=opt\$contrast_variable) + + theme(plot.title = element_text(size = 12)) ) dev.off() - - png(paste(output_prefix, 'proteus', normfun, 'normalized_mean_variance_relationship.png', sep = '.'), width = 5*300, height = 5*300, res = 300, pointsize = 8) + + png(paste(output_prefix, 'proteus', normfun, 'normalized_mean_variance_relationship.png', sep='.'), width = 5*300, height = 5*300, res = 300, pointsize = 8) print( - plotMV(proteinGroups.normalized, with.loess=opt\$plotMV_loess) - + ggtitle(paste0("Sample mean variance relationship after applying\n", normfun)) - + scale_fill_distiller(palette=opt\$palette_name) - + theme(plot.title = element_text(size = 12)) + plotMV(proteinGroups.normalized, with.loess=opt\$plotMV_loess) + + ggtitle(paste0("Sample mean variance relationship after applying\n", normfun)) + + scale_fill_distiller(palette=opt\$palette_name) + + theme(plot.title = element_text(size = 12)) ) dev.off() - png(paste(output_prefix, 'proteus', normfun, 'normalized_dendrogram.png', sep = '.'), width = 5*300, height = 5*300, res = 300, pointsize = 8) + png(paste(output_prefix, 'proteus', normfun, 'normalized_dendrogram.png', sep='.'), width = 5*300, height = 5*300, res = 300, pointsize = 8) print( plotClustering(proteinGroups.normalized) - + ggtitle(paste0("Sample clustering after applying\n", normfun)) - + theme(plot.title = element_text(size = 12)) + + ggtitle(paste0("Sample clustering after applying\n", normfun)) + + theme(plot.title = element_text(size = 12)) ) dev.off() - + # R object for other processes to use - - saveRDS(proteinGroups.normalized, file = paste(output_prefix, 'proteus', normfun, 'normalized_proteingroups.rds', sep=".")) + + saveRDS(proteinGroups.normalized, file = paste(output_prefix, 'proteus', normfun, 'normalized_proteingroups.rds', sep='.')) # Write normalized intensities matrix - + out_df <- data.frame( - proteinGroups.normalized\$tab, + round_dataframe_columns(proteinGroups.normalized\$tab, digits=opt\$round_digits), check.names = FALSE ) - out_df[[opt\$protein_id_col]] <- rownames(proteinGroups.normalized\$tab) # proteus saves the IDs as rownames; make column from those + out_df[[opt\$protein_id_col]] <- rownames(proteinGroups.normalized\$tab) # proteus saves the IDs as rownames; save these to a separate column out_df <- out_df[c(opt\$protein_id_col, colnames(out_df)[colnames(out_df) != opt\$protein_id_col])] # move ID column to first position write.table( out_df, @@ -300,13 +321,14 @@ proteinGroups\$tab <- na.omit(log2(proteinGroups\$tab)) # Generate raw distribution plot -png(paste(output_prefix, 'proteus.raw_distributions.png', sep = '.'), width = 5*300, height = 5*300, res = 300, pointsize = 8) +png(paste(output_prefix, 
'proteus.raw_distributions.png', sep='.'), width = 5*300, height = 5*300, res = 300, pointsize = 8) print( plotSampleDistributions(proteinGroups, title="Raw sample distributions", fill="condition", method=opt\$plotSampleDistributions_method) + scale_fill_brewer(palette=opt\$palette_name, name=opt\$contrast_variable) - + theme(plot.title = element_text(size = 12)) + + theme(plot.title = element_text(size = 12)) ) dev.off() + # R object for other processes to use saveRDS(proteinGroups, file = paste(output_prefix, 'proteus.raw_proteingroups.rds', sep = '.')) @@ -314,10 +336,10 @@ saveRDS(proteinGroups, file = paste(output_prefix, 'proteus.raw_proteingroups.rd # Write raw intensities matrix out_df <- data.frame( - proteinGroups\$tab, + round_dataframe_columns(proteinGroups\$tab, digits=opt\$round_digits), check.names = FALSE ) -out_df[[opt\$protein_id_col]] <- rownames(proteinGroups\$tab) # proteus saves the IDs as rownames; make column from those +out_df[[opt\$protein_id_col]] <- rownames(proteinGroups\$tab) # proteus saves the IDs as rownames; save these to a separate column out_df <- out_df[c(opt\$protein_id_col, colnames(out_df)[colnames(out_df) != opt\$protein_id_col])] # move ID column to first position @@ -354,13 +376,13 @@ writeLines( c( '"${task.process}":', paste(' r-base:', r.version), - paste(' bioconductor-limma:', limma.version), + paste(' r-proteus-bartongroup:', proteus.version), paste(' r-plotly:', plotly.version), - paste(' r-proteus-bartongroup:', proteus.version) + paste(' bioconductor-limma:', limma.version) ), 'versions.yml') ################################################ ################################################ ################################################ -################################################ \ No newline at end of file +################################################ diff --git a/modules/nf-core/proteus/templates/proteus_readproteingroups.R b/modules/nf-core/proteus/templates/proteus_readproteingroups.R deleted file mode 100755 index 263e18e0..00000000 --- a/modules/nf-core/proteus/templates/proteus_readproteingroups.R +++ /dev/null @@ -1,376 +0,0 @@ -#!/usr/bin/env Rscript - - - - -# TODO: Add link to https://rdrr.io/github/bartongroup/Proteus/man/readProteinGroups.html to docu and mention the necessary columns! - - - - - - -################################################ -################################################ -## Functions ## -################################################ -################################################ - -#' Parse out options from a string without recourse to optparse -#' -#' @param x Long-form argument list like --opt1 val1 --opt2 val2 -#' -#' @return named list of options and values similar to optparse - -parse_args <- function(x){ - args_list <- unlist(strsplit(x, ' ?--')[[1]])[-1] - args_vals <- lapply(args_list, function(x) scan(text=x, what='character', quiet = TRUE)) - - # Ensure the option vectors are length 2 (key/ value) to catch empty ones - args_vals <- lapply(args_vals, function(z){ length(z) <- 2; z}) - - parsed_args <- structure(lapply(args_vals, function(x) x[2]), names = lapply(args_vals, function(x) x[1])) - parsed_args[! 
is.na(parsed_args)] -} - -#' Flexibly read CSV or TSV files -#' -#' @param file Input file -#' @param header Passed to read.delim() -#' @param row.names Passed to read.delim() -#' -#' @return output Data frame - -read_delim_flexible <- function(file, header = TRUE, row.names = NULL, check.names = F){ - - ext <- tolower(tail(strsplit(basename(file), split = "\\\\.")[[1]], 1)) - - if (ext == "tsv" || ext == "txt") { - separator <- "\\t" - } else if (ext == "csv") { - separator <- "," - } else { - stop(paste("Unknown separator for", ext)) - } - - read.delim( - file, - sep = separator, - header = header, - row.names = row.names, - check.names = check.names - ) -} - -#' Round numeric dataframe columns to fixed decimal places by applying -#' formatting and converting back to numerics -#' -#' @param dataframe A data frame -#' @param columns Which columns to round (assumes all of them by default) -#' @param digits How many decimal places to round to? -#' -#' @return output Data frame -# TODO check if this is necessary -round_dataframe_columns <- function(df, columns = NULL, digits = 8){ - if (is.null(columns)){ - columns <- colnames(df) - } - - df[,columns] <- format( - data.frame(df[, columns], check.names = FALSE), - nsmall = digits - ) - - # Convert columns back to numeric - - for (c in columns) { - df[[c]][grep("^ *NA\$", df[[c]])] <- NA - df[[c]] <- as.numeric(df[[c]]) - } - df -} - -################################################ -################################################ -## PARSE PARAMETERS FROM NEXTFLOW ## -################################################ -################################################ - -# I've defined these in a single array like this so that we could go back to an -# optparse-driven method in future with module bin/ directories, rather than -# the template - -# Set defaults and classes - -opt <- list( - intensities_file = '$intensities', - sample_file = '$samplesheet', - contrast_variable = NULL, - protein_id_col = 'Majority protein IDs', - sample_id_col = 'sample', - measure_col_prefix = 'intensities', - normfuns = 'normalizeMedian', - plotSampleDistributions_method = 'violin', - plotMV_loess = T, - palette_name = 'Set1' -) -opt_types <- lapply(opt, class) - -# Apply parameter overrides - -args_opt <- parse_args('$task.ext.args') -for ( ao in names(args_opt)){ - if (! ao %in% names(opt)){ - stop(paste("Invalid option:", ao)) - }else{ - - # Preserve classes from defaults where possible - if (! is.null(opt[[ao]])){ - args_opt[[ao]] <- as(args_opt[[ao]], opt_types[[ao]]) - } - opt[[ao]] <- args_opt[[ao]] - } -} - -# Check if required parameters have been provided - -required_opts <- c('intensities_file', 'sample_file', 'contrast_variable') -missing <- required_opts[unlist(lapply(opt[required_opts], is.null)) | ! required_opts %in% names(opt)] - -if (length(missing) > 0){ - stop(paste("Missing required options:", paste(missing, collapse=', '))) -} - -# Check file inputs are valid - -for (file_input in c('intensities_file', 'sample_file')){ - if (is.null(opt[[file_input]])) { - stop(paste("Please provide", file_input), call. = FALSE) - } - - if (! 
file.exists(opt[[file_input]])){ - stop(paste0('Value of ', file_input, ': ', opt[[file_input]], ' is not a valid file')) - } -} - -################################################ -################################################ -## Finish loading libraries ## -################################################ -################################################ - -library(limma) -library(plotly) -library(proteus) - -################################################ -################################################ -# READ IN INTENSITIES FILE AND SAMPLE METADATA # -################################################ -################################################ - -intensities.table <- - read_delim_flexible( - file = opt\$intensities_file, - check.names = FALSE - ) - -sample.sheet <- - read_delim_flexible( - file = opt\$sample_file, - check.names=FALSE - ) - -if (! opt\$protein_id_col %in% colnames(intensities.table)){ - stop(paste0("Specified protein ID column '", opt\$protein_id_col, "' is not in the intensities table")) -} - -if (! opt\$sample_id_col %in% colnames(sample.sheet)){ - stop(paste0("Specified sample ID column '", opt\$sample_id_col, "' is not in the sample sheet")) -} - -# Add metadata columns that are necessary for proteus - -sample.sheet\$sample <- sample.sheet[[opt\$sample_id_col]] -sample.sheet\$condition <- sample.sheet[[opt\$contrast_variable]] - -# Add prefix for proteinGroups measurement columns to the sample IDs from the sampesheet -measure.cols <- setNames(paste0(opt\$measure_col_prefix, sample.sheet[[opt\$sample_id_col]]), sample.sheet[[opt\$sample_id_col]]) - -# Check that all samples specified in the input sheet are present in the intensities table - -missing_columns <- paste0(opt\$measure_col_prefix, sample.sheet[[opt\$sample_id_col]]) -missing_columns <- missing_columns[!missing_columns %in% colnames(intensities.table)] -if (length(missing_columns) > 0) { - stop(paste( - length(missing_columns), - 'specified samples do not have a(n)', - opt\$measure_col_prefix, - 'column in intensities table. The following columns are missing:', - paste(missing_columns, collapse = ', ') - )) -} - -################################################ -################################################ -## CHECK AND FORMAT NORMFUN AND FILTERFUN ## -################################################ -################################################ - -valid_normfuns <- c("normalizeMedian", "normalizeQuantiles") -normfuns <- opt\$normfuns - -# Check validity of normfun(s) -invalid_normfuns <- normfuns[!(normfuns %in% valid_normfuns)] -if (length(invalid_normfuns)>0) { - stop(paste0("Invalid normfuns argument(s): ", - paste(invalid_normfuns, collapse=", "), - ". 
Valid normfuns are: ", - paste(valid_normfuns, collapse=", "), - ".")) -} - -################################################ -################################################ -## Run Proteus processes and generate outputs ## -################################################ -################################################ - -output_prefix <- opt\$contrast_variable - -# Replace proteus default ID column with user param and re-set the names of the resulting object (gsub sets the names to NULL) - -proteinColumns <- setNames(gsub("Majority protein IDs", opt\$protein_id_col, proteus::proteinColumns), names(proteus::proteinColumns)) -proteinGroups <- readProteinGroups( - file=opt\$intensities_file, - meta=sample.sheet, - measure.cols=measure.cols, - data.cols=proteinColumns -) - -# Generate plots for all requested normalizations; also, save normalized protein groups for limma - -for (normfun in normfuns) { - proteinGroups.normalized <- normalizeData(proteinGroups, norm.fun = eval(parse(text=normfun))) # Proteus also accepts other norm.funs, e.g. from limma - - # Apply log2 and remove NAs as these will otherwise mess with some of the following modules - - proteinGroups.normalized\$tab <- na.omit(log2(proteinGroups.normalized\$tab)) - - png(paste0(output_prefix, '.proteus.', normfun, '_normalised_distributions.png'), width = 5*300, height = 5*300, res = 300, pointsize = 8) - print( - plotSampleDistributions(proteinGroups.normalized, title=paste0("Sample distributions after applying\n", normfun), fill="condition", method=opt\$plotSampleDistributions_method) - + scale_fill_brewer(palette=opt\$palette_name, name=opt\$contrast_variable) - + theme(plot.title = element_text(size = 12)) - ) - dev.off() - - png(paste0(output_prefix, '.proteus.', normfun, '_normalised_mean_variance_relationship.png'), width = 5*300, height = 5*300, res = 300, pointsize = 8) - print( - plotMV(proteinGroups.normalized, with.loess=opt\$plotMV_loess) - + ggtitle(paste0("Sample mean variance relationship after applying\n", normfun)) - + scale_fill_distiller(palette=opt\$palette_name) - + theme(plot.title = element_text(size = 12)) - ) - dev.off() - - png(paste0(output_prefix, '.proteus.', normfun, '_normalised_dendrogram.png'), width = 5*300, height = 5*300, res = 300, pointsize = 8) - print( - plotClustering(proteinGroups.normalized) - + ggtitle(paste0("Sample clustering after applying\n", normfun)) - + theme(plot.title = element_text(size = 12)) - ) - dev.off() - - # R object for other processes to use - - saveRDS(proteinGroups.normalized, file = paste0(output_prefix, '.proteus.', normfun, 'normalised_proteingroups.rds')) - - # Write normalized intensities matrix - - out_df <- data.frame( - proteinGroups.normalized\$tab, - check.names = FALSE - ) - out_df[[opt\$protein_id_col]] <- rownames(proteinGroups.normalized\$tab) # proteus saves the IDs as rownames; make column from those - out_df <- out_df[c(opt\$protein_id_col, colnames(out_df)[colnames(out_df) != opt\$protein_id_col])] # move ID column to first position - write.table( - out_df, - file = paste(output_prefix, 'proteus', normfun, 'normalised_proteingroups_tab', 'tsv', sep = '.'), - col.names = TRUE, - row.names = FALSE, - sep = '\t', - quote = FALSE - ) -} - -# Process and save raw table - -proteinGroups\$tab <- na.omit(log2(proteinGroups\$tab)) - -# Generate raw distribution plot - -png(paste0(output_prefix, '.proteus.raw_distributions.png'), width = 5*300, height = 5*300, res = 300, pointsize = 8) -print( - plotSampleDistributions(proteinGroups, title="Raw sample 
distributions", fill="condition", method=opt\$plotSampleDistributions_method) - + scale_fill_brewer(palette=opt\$palette_name, name=opt\$contrast_variable) - + theme(plot.title = element_text(size = 12)) - ) -dev.off() - -# R object for other processes to use - -saveRDS(proteinGroups, file = paste0(output_prefix, '.proteus.raw_proteingroups.rds')) - -# Write raw intensities matrix - -out_df <- data.frame( - proteinGroups\$tab, - check.names = FALSE - ) -out_df[[opt\$protein_id_col]] <- rownames(proteinGroups\$tab) # proteus saves the IDs as rownames; make column from those -out_df <- out_df[c(opt\$protein_id_col, colnames(out_df)[colnames(out_df) != opt\$protein_id_col])] # move ID column to first position - - -write.table( - out_df, - file = paste(output_prefix, 'proteus', 'raw_proteingroups_tab', 'tsv', sep = '.'), - col.names = TRUE, - row.names = FALSE, - sep = '\t', - quote = FALSE -) - -################################################ -################################################ -## R SESSION INFO ## -################################################ -################################################ - -sink("R_sessionInfo.log") -print(sessionInfo()) -sink() - -################################################ -################################################ -## VERSIONS FILE ## -################################################ -################################################ - -r.version <- strsplit(version[['version.string']], ' ')[[1]][3] -limma.version <- as.character(packageVersion('limma')) -plotly.version <- as.character(packageVersion('plotly')) -proteus.version <- as.character(packageVersion('proteus')) -writeLines( - c( - '"${task.process}":', - paste(' r-base:', r.version), - paste(' bioconductor-limma:', limma.version), - paste(' r-plotly:', plotly.version), - paste(' r-proteus-bartongroup:', proteus.version) - ), -'versions.yml') -################################################ -################################################ -################################################ -################################################ \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index 7e27beca..a85c4692 100644 --- a/nextflow.config +++ b/nextflow.config @@ -44,7 +44,7 @@ params { affy_cel_files_archive = null affy_file_name_col = 'file' affy_background = true - affy_bgversion = 2 + affy_bgversion = 2 affy_destructive = false affy_cdfname = null affy_rm_mask = false @@ -52,8 +52,13 @@ params { affy_rm_extra = false affy_build_annotation = true - // MaxQuant-specific options - maxquant_measurecol_prefix = 'LFQ intensity ' + // Proteus-specific options + proteus_measurecol_prefix = 'LFQ intensity ' + proteus_norm_functions = 'normalizeMedian' + proteus_plotSD_method = 'violin' + proteus_plotMV_loess = true + proteus_palette_name = 'Set1' + proteus_round_digits = -1 // Filtering options filtering_min_samples = 1 @@ -309,6 +314,7 @@ profiles { affy { includeConfig 'conf/affy.config' } rnaseq { includeConfig 'conf/rnaseq.config' } test_affy { includeConfig 'conf/test_affy.config' } + test_maxquant { includeConfig 'conf/test_maxquant.config' } } // Set default registry for Docker and Podman independent of -profile diff --git a/nextflow_schema.json b/nextflow_schema.json index 1b7c0c39..02cfc4b6 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -24,7 +24,7 @@ "default": "rnaseq", "description": "A string identifying the technology used to produce the data", "help_text": "Currently 'rnaseq' or 'affy_array' may be specified.", - "enum": 
["rnaseq", "affy_array", "Px"], + "enum": ["rnaseq", "affy_array", "maxquant"], "fa_icon": "far fa-keyboard" }, "input": { @@ -230,6 +230,49 @@ }, "fa_icon": "fas fa-table" }, + "proteus_input_options": { + "title": "Proteus input options", + "type": "object", + "description": "Options for processing of proteomics MaxQuant tables with the Proteus R package", + "default": "", + "properties": { + "proteus_measurecol_prefix": { + "type": "string", + "default": "LFQ intensity ", + "description": "Prefix of the column names of the MaxQuant proteingroups table in which the intensity values are saved; the prefix has to be followed by the sample names that are also found in the samplesheet. Default: 'LFQ intensity '; take care to also consider trailing whitespace between prefix and samplenames." + }, + "proteus_norm_functions": { + "type": "string", + "default": "normalizeMedian", + "description": "Comma-separated string of normalization functions to use on the MaxQuant intensities.", + "help_text": "'normalizeMedian', 'normalizeQuantiles' or any comma-separated combination thereof" + }, + "proteus_plotSD_method": { + "type": "string", + "default": "violin", + "description": "Which method to use for plotting sample distributions of the MaxQuant intensities.", + "help_text": "'violin', 'dist' or 'box'", + "enum": ["violin", "dist", "box"] + }, + "proteus_plotMV_loess": { + "type": "boolean", + "default": true, + "description": "Should a loess line be added to the plot of mean-variance relationship of the conditions? Default: true." + }, + "proteus_palette_name": { + "type": "string", + "default": "Set1", + "help_text": "Check the content of `RColorBrewer::brewer.pal.info` from an R terminal for valid palette names.", + "description": "Valid R palette name", + "fa_icon": "fas fa-palette" + }, + "proteus_round_digits": { + "type": "number", + "default": -1, + "description": "Number of decimals to round the MaxQuant intensities to; default: -1 (will not round)." 
+ } + } + }, "filtering": { "title": "Filtering", "type": "object", @@ -1095,6 +1138,9 @@ { "$ref": "#/definitions/affy_input_options" }, + { + "$ref": "#/definitions/proteus_input_options" + }, { "$ref": "#/definitions/filtering" }, diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf index 7c86dbdc..70851c9f 100644 --- a/workflows/differentialabundance.nf +++ b/workflows/differentialabundance.nf @@ -22,7 +22,7 @@ if (params.study_type == 'affy_array'){ } else { error("CEL files archive not specified!") } -} else if (params.study_type == 'Px') { +} else if (params.study_type == 'maxquant') { proteus_in = Channel.of([ exp_meta, file(params.input), file(params.matrix) ]) } else { // If this is not an affy array or maxquant output, assume we're reading from a matrix @@ -92,8 +92,8 @@ include { CUSTOM_TABULARTOGSEACLS } from '../modules/n include { RMARKDOWNNOTEBOOK } from '../modules/nf-core/rmarkdownnotebook/main' include { AFFY_JUSTRMA as AFFY_JUSTRMA_RAW } from '../modules/nf-core/affy/justrma/main' include { AFFY_JUSTRMA as AFFY_JUSTRMA_NORM } from '../modules/nf-core/affy/justrma/main' -include { PROTEUS } from '../modules/nf-core/proteus/main' - +include { PROTEUS_READPROTEINGROUPS as PROTEUS } from '../modules/nf-core/proteus/readproteingroups/main' + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ RUN MAIN WORKFLOW @@ -138,12 +138,25 @@ workflow DIFFERENTIALABUNDANCE { ch_in_norm = AFFY_JUSTRMA_NORM.out.expression ch_affy_platform_features = AFFY_JUSTRMA_RAW.out.annotation - } else if (params.study_type == 'Px') { - PROTEUS(proteus_in) - ch_in_raw = PROTEUS.out.tab - ch_in_norm = PROTEUS.out.normtab - } + } else if (params.study_type == 'maxquant') { + + ch_contrasts_proteus = Channel.from(file(params.contrasts)) + .splitCsv(header:true, sep:',') + .map{ + tuple( + exp_meta, // meta map + it.variable // contrast variable + ) + } + + PROTEUS( + proteus_in, + ch_contrasts_proteus + ) + ch_in_raw = PROTEUS.out.raw_tab + ch_in_norm = PROTEUS.out.norm_tab + } //// Fetch or derive a feature annotation table @@ -179,15 +192,13 @@ workflow DIFFERENTIALABUNDANCE { .mix(GTF_TO_TABLE.out.versions) } else{ - if (params.study_type == 'Px'){ - ch_features = PROTEUS.out.normtab.map{ + if (params.study_type == 'maxquant'){ + ch_features = PROTEUS.out.norm_tab.map{ matrix_as_anno_filename = "matrix_as_anno.${it[1].getExtension()}" it[1].copyTo(matrix_as_anno_filename) it[1] = file(matrix_as_anno_filename) it - }.dump(tag:'waaaaa') - - //ch_features = PROTEUS.out.normtab2 //Channel.of([ exp_meta, file(matrix_as_anno_filename)]) + } } else { @@ -208,20 +219,17 @@ workflow DIFFERENTIALABUNDANCE { ch_matrices_for_validation = ch_in_raw .join(ch_in_norm) .map{tuple(it[0], [it[1], it[2]])} - } else if (params.study_type == 'Px') { + } else if (params.study_type == 'maxquant') { ch_matrices_for_validation = ch_in_raw .join(ch_in_norm) - .dump(tag:'matval_px1') .map{tuple(it[0], [it[1], it[2]])} - .dump(tag:'matval_px') } else{ ch_matrices_for_validation = ch_in_raw } - print("häää") - print(params.observations_id_col) + VALIDATOR( - ch_input.join(ch_matrices_for_validation).dump(tag:'val_input'), + ch_input.join(ch_matrices_for_validation), ch_features, ch_contrasts_file ) @@ -229,18 +237,16 @@ workflow DIFFERENTIALABUNDANCE { // For Affy, we've validated multiple input matrices for raw and norm, // we'll separate them out again here - if (params.study_type == 'affy_array' || params.study_type == 'Px'){ + if (params.study_type == 
'affy_array' || params.study_type == 'maxquant'){ ch_validated_assays = VALIDATOR.out.assays .transpose() .branch { raw: it[1].name.contains('raw') - normalised: it[1].name.contains('normalised') + normalised: it[1].name.contains('normalised') || it[1].name.contains('normalized') } ch_raw = ch_validated_assays.raw ch_norm = ch_validated_assays.normalised ch_matrix_for_differential = ch_norm - VALIDATOR.out.assays.dump(tag:'valassay') - ch_norm.dump(tag:'ch_norm') } else{ ch_raw = VALIDATOR.out.assays @@ -262,29 +268,27 @@ workflow DIFFERENTIALABUNDANCE { tuple(it, it.variable, it.reference, it.target) } - if (params.study_type == 'Px') { + if (params.study_type == 'maxquant') { ch_samples_and_matrix = VALIDATOR.out.sample_meta .join(ch_matrix_for_differential) // -> meta, samplesheet, unfiltered matrix .first() + } else { // Firstly Filter the input matrix - ch_matrix_for_differential.dump(tag:'differentialmat') - VALIDATOR.out.sample_meta.dump(tag:'val_out_samplemeta') + ch_matrix_for_differential + VALIDATOR.out.sample_meta CUSTOM_MATRIXFILTER( ch_matrix_for_differential, VALIDATOR.out.sample_meta ) - // Prepare inputs for differential processes ch_samples_and_matrix = VALIDATOR.out.sample_meta .join(CUSTOM_MATRIXFILTER.out.filtered) // -> meta, samplesheet, filtered matrix .first() } - - if (params.study_type == 'affy_array' || params.study_type == 'Px'){ - + if (params.study_type == 'affy_array' || params.study_type == 'maxquant'){ LIMMA_DIFFERENTIAL ( ch_contrasts, ch_samples_and_matrix @@ -409,7 +413,7 @@ workflow DIFFERENTIALABUNDANCE { ch_contrast_variables .combine(ch_all_matrices.map{ it.tail() }) -/* + PLOT_EXPLORATORY( ch_contrast_variables .combine(ch_all_matrices.map{ it.tail() }) @@ -428,7 +432,7 @@ workflow DIFFERENTIALABUNDANCE { .mix(VALIDATOR.out.versions) .mix(PLOT_EXPLORATORY.out.versions) .mix(PLOT_DIFFERENTIAL.out.versions) -*/ + CUSTOM_DUMPSOFTWAREVERSIONS ( ch_versions.unique().collectFile(name: 'collated_versions.yml') ) @@ -477,8 +481,8 @@ workflow DIFFERENTIALABUNDANCE { .combine(ch_differential.map{it[1]}.collect().map{[it]}) SHINYNGS_APP( - ch_all_matrices.dump(tag:'shiny1'), // meta, samples, features, [ matrices ] - ch_app_differential.dump(tag:'shiny2'), // meta, contrasts, [differential results] + ch_all_matrices, // meta, samples, features, [ matrices ] + ch_app_differential, // meta, contrasts, [differential results] params.exploratory_assay_names.split(',').findIndexOf { it == params.exploratory_final_assay } + 1 ) ch_versions = ch_versions.mix(SHINYNGS_APP.out.versions) @@ -494,7 +498,7 @@ workflow DIFFERENTIALABUNDANCE { // Condition params reported on study type def params_pattern = ~/^(report|study|observations|features|filtering|exploratory|differential|deseq2|gsea).*/ - if (params.study_type == 'affy_array' || params.study_type == 'Px'){ + if (params.study_type == 'affy_array' || params.study_type == 'maxquant'){ params_pattern = ~/^(report|study|observations|features|filtering|exploratory|differential|affy|limma|gsea).*/ } @@ -505,7 +509,7 @@ workflow DIFFERENTIALABUNDANCE { } // Render the final report - ch_report_params.dump(tag:'ch_report_params') + ch_report_params RMARKDOWNNOTEBOOK( ch_report_file, ch_report_params, From 80f5ad07a40e8afed9ae69e6f0d4220d822332cb Mon Sep 17 00:00:00 2001 From: WackerO Date: Tue, 18 Jul 2023 09:39:19 +0200 Subject: [PATCH 07/30] prettier, removed an unnecessary file --- conf/test_maxquant.config | 59 +++++++++++++++++++++++ modules/nf-core/rmarkdownnotebook/main.nf | 4 +- modules/nf-core/shinyngs/app/main.nf | 
10 +--- nextflow_schema.json | 2 +- pxnotebook_env.yml | 21 -------- 5 files changed, 63 insertions(+), 33 deletions(-) create mode 100644 conf/test_maxquant.config delete mode 100644 pxnotebook_env.yml diff --git a/conf/test_maxquant.config b/conf/test_maxquant.config new file mode 100644 index 00000000..78112489 --- /dev/null +++ b/conf/test_maxquant.config @@ -0,0 +1,59 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple + pipeline test with MaxQuant Mass-spec data. + + Use as follows: + nextflow run nf-core/differentialabundance -profile test_maxquant, --outdir + +---------------------------------------------------------------------------------------- +*/ + +includeConfig 'maxquant.config' + +params { + study_name = 'PXD043349' + config_profile_name = 'MaxQuant test profile' + config_profile_description = 'MaxQuant test dataset to check pipeline function' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' + + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/proteomics/maxquant/MaxQuant_samplesheet.tsv' + matrix = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/proteomics/maxquant/MaxQuant_proteinGroups.txt' + contrasts = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/proteomics/maxquant/MaxQuant_contrasts.csv' + + // Features + features_id_col = 'Majority protein IDs' + features_metadata_cols = "Majority protein IDs" + + // Observations + observations_id_col = 'Experiment' + observations_name_col = 'Name' + maxquant_measurecol_prefix = 'LFQ intensity ' + + // Exploratory + exploratory_main_variable = 'Celltype' + exploratory_assay_names = 'raw,normalised' + exploratory_final_assay = 'normalised' + + // Differential + differential_feature_id_column = 'probe_id' + differential_fc_column = 'logFC' + differential_qval_column = 'adj.P.Val' +} + +// This is necessary so that some of the parameters changed above are actually accepted by the pipeline (unless using -params-file test_maxquant.yml) +process { + withName: VALIDATOR { + publishDir = [ + enabled: false + ] + ext.args = "--sample_id_col '${params.observations_id_col}' --feature_id_col '${params.features_id_col}'" + } +} \ No newline at end of file diff --git a/modules/nf-core/rmarkdownnotebook/main.nf b/modules/nf-core/rmarkdownnotebook/main.nf index ec8f21b1..45ed550b 100644 --- a/modules/nf-core/rmarkdownnotebook/main.nf +++ b/modules/nf-core/rmarkdownnotebook/main.nf @@ -7,10 +7,10 @@ process RMARKDOWNNOTEBOOK { //NB: You likely want to override this with a container containing all required //dependencies for your analysis. The container at least needs to contain the //yaml and rmarkdown R packages. - conda "conda-forge::r-base=4.1.0 conda-forge::r-rmarkdown=2.9 conda-forge::r-yaml=2.2.1 anaconda::gmp=6.2.1 conda-forge::r-ggplot2=3.4.2" + conda "conda-forge::r-base=4.1.0 conda-forge::r-rmarkdown=2.9 conda-forge::r-yaml=2.2.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/mulled-v2-31ad840d814d356e5f98030a4ee308a16db64ec5:0e852a1e4063fdcbe3f254ac2c7469747a60e361-0' : - 'docker.io/library/pxnotebook_local' }" + 'biocontainers/mulled-v2-31ad840d814d356e5f98030a4ee308a16db64ec5:0e852a1e4063fdcbe3f254ac2c7469747a60e361-0' }" input: tuple val(meta), path(notebook) diff --git a/modules/nf-core/shinyngs/app/main.nf b/modules/nf-core/shinyngs/app/main.nf index d7c03291..7d601105 100644 --- a/modules/nf-core/shinyngs/app/main.nf +++ b/modules/nf-core/shinyngs/app/main.nf @@ -40,7 +40,7 @@ process SHINYNGS_APP { make_app_from_files.R \\ --sample_metadata $sample \\ --feature_metadata $feature_meta \\ - --assay_files "/home/iivow01/git/differentialabundance/results_Px_noNA/proteus/output_prefix.proteus.raw_proteingroups_tab.tsv,/home/iivow01/git/differentialabundance/results_Px_noNA/proteus/output_prefix.proteus.normalizeMedian.normalised_proteingroups_tab.tsv" \\ + --assay_files ${assay_files.join(',')} \\ --contrast_file $contrasts \\ --contrast_stats_assay $contrast_stats_assay \\ --differential_results ${differential_results.join(',')} \\ @@ -54,11 +54,3 @@ process SHINYNGS_APP { END_VERSIONS """ } - -// --assay_files ${assay_files.join(',')} \\ - -// --differential_results ${differential_results.join(',')} \\ - -// --assay_files "/home/iivow01/git/differentialabundance/results_Px_noNA/proteus/output_prefix.proteus.raw_proteingroups_tab.tsv,/home/iivow01/git/differentialabundance/results_Px_noNA/proteus/output_prefix.proteus.normalizeMedian.normalised_proteingroups_tab.tsv" \\ - -// --differential_results "/home/iivow01/git/differentialabundance/results_Px_noNA/tables/differential/Condition__genotype-WT-NFAT1_plus_2_minus_KO.limma.results.tsv,/home/iivow01/git/differentialabundance/results_Px_noNA/tables/differential/Condition__genotype-WT-NFAT1_minus_KO.limma.results.tsv,/home/iivow01/git/differentialabundance/results_Px_noNA/tables/differential/Condition__genotype-WT-NFAT2_minus_KO.limma.results.tsv" \\ diff --git a/nextflow_schema.json b/nextflow_schema.json index 02cfc4b6..9ceaa522 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -244,7 +244,7 @@ "proteus_norm_functions": { "type": "string", "default": "normalizeMedian", - "description": "Comma-separated string of normalization functions to use on the MaxQuant intensities.", + "description": "Comma-separated string of normalization functions to use on the MaxQuant intensities.", "help_text": "'normalizeMedian', 'normalizeQuantiles' or any comma-separated combination thereof" }, "proteus_plotSD_method": { diff --git a/pxnotebook_env.yml b/pxnotebook_env.yml deleted file mode 100644 index 8049c896..00000000 --- a/pxnotebook_env.yml +++ /dev/null @@ -1,21 +0,0 @@ -# You can use this file to create a conda environment for this pipeline: -# conda env create -f environment.yml -# use this to find packages: https://anaconda.org/ -name: pxnotebook -channels: - - bioconda - - conda-forge - - anaconda -dependencies: - - anaconda::gmp=6.2.1 - - bioconda::r-shinyngs=1.7.2 - - conda-forge::r-base=4.2.3 - - conda-forge::r-dplyr=1.1.2 - - conda-forge::r-dt=0.28 - - conda-forge::r-knitr=1.43 - - conda-forge::r-plotly=4.10.1 - - conda-forge::r-rmarkdown=2.21 - - conda-forge::r-tidyverse=2.0.0  - - conda-forge::r-yaml=2.3.7 - - conda-forge::r-ggplot2=3.4.2 - - conda-forge::r-upsetr=1.4.0 \ No newline at end of file From e4395bd0b0fac9e12c437f2e88c69f1250a6c063 Mon Sep 17 00:00:00 2001 From: WackerO Date: Tue, 18 Jul 2023 09:40:56 +0200 Subject: [PATCH 08/30] Added 
missing config --- conf/maxquant.config | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 conf/maxquant.config diff --git a/conf/maxquant.config b/conf/maxquant.config new file mode 100644 index 00000000..7da526fb --- /dev/null +++ b/conf/maxquant.config @@ -0,0 +1,42 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running MaxQuant proteomics analysis +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines settings specific to MaxQuant proteomics analysis + + Use as follows: + nextflow run nf-core/differentialabundance -profile maxquant, --outdir + +---------------------------------------------------------------------------------------- +*/ + +params { + + config_profile_name = 'MaxQuant profile' + config_profile_description = 'Settings for MaxQuant analysis' + + // Study + study_type = 'maxquant' + study_abundance_type = 'intensities' + + // Features + features_id_col = 'Majority protein IDs' + features_name_col = 'Majority protein IDs' + features_metadata_cols = 'Majority protein IDs' + features_type = 'protein' + + // Exploratory + exploratory_assay_names = "raw,normalised,variance_stabilised" + exploratory_final_assay = "variance_stabilised" + + // Differential options + differential_file_suffix = ".limma.results.tsv" + differential_fc_column = "logFC" + differential_pval_column = "P.Value" + differential_qval_column = "adj.P.Val" + differential_feature_id_column = "probe_id" + differential_feature_name_column = "probe_id" + + // Shiny does not work for this datatype + shinyngs_build_app = false +} From 27a3123c0d66c349479c4d9efe851b1f1970c3e2 Mon Sep 17 00:00:00 2001 From: WackerO Date: Wed, 19 Jul 2023 08:42:25 +0200 Subject: [PATCH 09/30] linting --- conf/modules.config | 2 +- conf/test_maxquant.config | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 09bc1601..66b2245c 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -100,7 +100,7 @@ process { "--build_annotation ${params.affy_build_annotation}" ].join(' ').trim() } } - + withName: PROTEUS { publishDir = [ [ diff --git a/conf/test_maxquant.config b/conf/test_maxquant.config index 78112489..38eed000 100644 --- a/conf/test_maxquant.config +++ b/conf/test_maxquant.config @@ -31,7 +31,7 @@ params { // Features features_id_col = 'Majority protein IDs' features_metadata_cols = "Majority protein IDs" - + // Observations observations_id_col = 'Experiment' observations_name_col = 'Name' @@ -56,4 +56,4 @@ process { ] ext.args = "--sample_id_col '${params.observations_id_col}' --feature_id_col '${params.features_id_col}'" } -} \ No newline at end of file +} From 7013352a0f987f0cde631d8b447708a854c5d3ab Mon Sep 17 00:00:00 2001 From: WackerO Date: Wed, 19 Jul 2023 11:09:43 +0200 Subject: [PATCH 10/30] Undid some accidental changes --- conf/modules.config | 2 +- docs/output.md | 60 ++++++++++++++++++- .../limma/differential/templates/limma_de.R | 2 +- nextflow.config | 4 ++ nextflow_schema.json | 12 ++++ 5 files changed, 75 insertions(+), 5 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 66b2245c..0d77822a 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -32,7 +32,7 @@ process { pattern: '*.anno.tsv' ] ] - ext.args = "--feature-type transcript" + ext.args = "--feature-type '${params.features_gtf_feature_type}' --first-field 
'${params.features_gtf_table_first_field}'" } diff --git a/docs/output.md b/docs/output.md index 4bf07ea8..f93e90be 100644 --- a/docs/output.md +++ b/docs/output.md @@ -68,15 +68,17 @@ The `differential` folder is likely to be the core result set for most users, co ## Shiny app +
+<summary>Output files</summary>
+
 - `shinyngs_app/`:
   - `[study name]`:
     - `data.rds`: serialized R object which can be used to generate a Shiny application
     - `app.R`: minimal R script that will source the data object and generate the app
 
-The app must be run in an environment with [ShinyNGS](https://github.com/pinin4fjords/shinyngs) installed, or you can see the workflow parameters to deploy to shinyapps.io (see usage documentation).
+</details>
-
-<summary>Output files</summary>
+The app must be run in an environment with [ShinyNGS](https://github.com/pinin4fjords/shinyngs) installed, or you can see the workflow parameters to deploy to shinyapps.io (see usage documentation).

### Pipeline information

@@ -91,3 +93,55 @@ The app must be run in an environment with [ShinyNGS](https://github.com/pinin4f
[Nextflow](https://www.nextflow.io/docs/latest/tracing.html) provides excellent functionality for generating various reports relevant to the running and execution of the pipeline. This will allow you to troubleshoot errors with the running of the pipeline, and also provide you with other information such as launch commands, run times and resource usage. + +## Frequently asked questions + +### Why are no genes flagged as differentially expressed? + +#### 1. Low replication: + +**Problem:** The number of replicates in your RNA-seq experiment may be insufficient to detect statistically significant differential expression. + +**Suggested course of action:** Consider increasing the number of replicates to improve the statistical power of your analysis. Repeating the experiment with greater replication allows for better estimation of biological variation and increases the chances of observing significant differential expression. Consult with experimental design experts or statisticians to determine the appropriate sample size calculation based on your specific research question and resources. + +#### 2. Subtle effect: + +**Problem:** The experimental intervention may have a relatively subtle impact on gene expression, making it challenging to detect differential expression using default thresholds. + +**Suggested course of action:** Adjust the analysis parameters to improve sensitivity in capturing smaller changes in gene expression. Try reducing the `differential_min_fold_change` parameter to include genes with smaller fold changes. Additionally, consider increasing the `differential_max_qval` parameter to relax the significance threshold and capture a broader range of significant p-values or q-values. By fine-tuning these parameters, you increase the likelihood of identifying genes with subtle but biologically relevant changes in expression. + +#### 3. Genuinely no differential expression: + +**Problem:** It is possible that the experimental intervention has not significantly impacted gene expression, resulting in the absence of differentially expressed genes. + +**Suggested course of action:** Evaluate the experimental design and the perturbation itself. If the intervention is expected to induce changes in gene expression but no differential expression is observed, revisit the experimental design, biological perturbation, or underlying hypothesis. Consider reassessing the experimental conditions or exploring alternative approaches to investigate other aspects of the biological system. + +#### 4. Unaccounted sources of variance: + +**Problem:** Other factors outside the main treatment may introduce variance in gene expression, leading to a decrease in power to detect differential expression. + +**Suggested course of action:** Examine the PCA (Principal Component Analysis) and metadata association plots generated by the workflow. Identify variables associated with components that contribute significantly to the variance in your data. Include these variables as covariates in the contrasts table's blocking column to account for their effects on gene expression. By incorporating these unaccounted sources of variance into your analysis, you improve the accuracy and power to detect differential expression. + +#### 5. Biological complexity and pathway-level effects: + +**Problem:** The experimental intervention may not lead to observable differential expression at the individual gene level, but there may be coordinated changes at the pathway or functional level. 
+ +**Suggested course of action:** Utilize pathway analysis tools such as Gene Set Enrichment Analysis (GSEA), available in this workflow. These tools evaluate the enrichment of gene sets or functional annotations to identify broader biological processes influenced by the experimental intervention. By focusing on pathway-level analysis, you can capture the overall impact of the intervention on biological processes, even if differential expression at the individual gene level is not apparent. + +#### 6. Limited options for normalization: + +**Problem:** The nf-core differential abundance workflow currently offers a limited set of normalization methods, which may not fully address the specific normalization requirements of your experiment. + +**Suggested course of action:** If the existing options do not adequately address your experiment's normalization challenges, consider developing custom normalization modules tailored to your needs. By contributing these modules to the nf-core community, you can expand the range of normalization options available to researchers. Your contributions will help researchers in similar situations and contribute to the continuous improvement and customization of the workflow. + +#### 7. Technical variability and batch effects: + +**Problem:** Technical variability and batch effects can introduce noise and confound the detection of differential expression. + +**Suggested course of action:** Address technical variability and batch effects in the experimental design and data analysis. Randomize sample collection, incorporate control samples, and balance samples across different experimental batches. These measures minimize technical variation, enhance the robustness of the analysis, and increase the chances of detecting true differential expression. + +#### 8. Workflow issues or bugs: + +**Problem:** Potential issues or bugs in the nf-core differential abundance workflow can affect the detection of differential expression or data analysis. + +**Suggested course of action:** Report any issues or suspected bugs by opening an issue on the [nf-core differential abundance workflow repository](https://github.com/nf-core/differentialabundance). Provide specific details, such as software versions, error messages, and relevant data or code snippets. Your feedback is valuable for improving the workflow's reliability. If you have the technical expertise, consider contributing to the workflow by submitting pull requests to address issues, fix bugs, or propose enhancements. 
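To make FAQ item 2 concrete: the thresholds it names are ordinary pipeline parameters, so relaxing them is just a matter of overriding them at launch. The sketch below assumes a typical run of the pipeline; the profile, file names and exact values are illustrative placeholders, and only `--differential_min_fold_change` and `--differential_max_qval` are taken from the FAQ text itself.

```bash
# Sketch only: re-run with relaxed reporting thresholds (FAQ item 2).
# File names, profile and values are placeholders; adjust to your own experiment.
nextflow run nf-core/differentialabundance \
    -profile docker \
    --input samplesheet.csv \
    --matrix salmon.merged.gene_counts.tsv \
    --contrasts contrasts.csv \
    --differential_min_fold_change 1.2 \
    --differential_max_qval 0.1 \
    --outdir results_relaxed
```

These cut-offs govern which features are reported and plotted as significant rather than the model fit itself, so they are safe to iterate on.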
diff --git a/modules/nf-core/limma/differential/templates/limma_de.R b/modules/nf-core/limma/differential/templates/limma_de.R index c6116928..47d0424f 100755 --- a/modules/nf-core/limma/differential/templates/limma_de.R +++ b/modules/nf-core/limma/differential/templates/limma_de.R @@ -155,8 +155,8 @@ intensities.table <- row.names = opt\$probe_id_col, check.names = FALSE ) - sample.sheet <- read_delim_flexible(file = opt\$sample_file) + # Deal with spaces that may be in sample column opt\$sample_id_col <- make.names(opt\$sample_id_col) diff --git a/nextflow.config b/nextflow.config index 160402f9..2406f16a 100644 --- a/nextflow.config +++ b/nextflow.config @@ -40,6 +40,10 @@ params { features_name_col = 'gene_name' features_metadata_cols = 'gene_id,gene_name,gene_biotype' + // GTF parsing options + features_gtf_feature_type = 'transcript' + features_gtf_table_first_field = 'gene_id' + // Affy-specific options affy_cel_files_archive = null affy_file_name_col = 'file' diff --git a/nextflow_schema.json b/nextflow_schema.json index 9ceaa522..4acd0035 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -165,6 +165,18 @@ "description": "This parameter allows you to supply your own feature annotations. These can often be automatically derived from the GTF used upstream for RNA-seq, or from the Bioconductor annotation package (for affy arrays). ", "help_text": "This parameter allows you to supply your own feature annotations. These can often be automatically derived from the GTF used upstream for RNA-seq, or from the Bioconductor annotation package (for affy arrays). ", "fa_icon": "fas fa-align-justify" + }, + "features_gtf_feature_type": { + "type": "string", + "default": "transcript", + "description": "Where a GTF file is supplied, which feature type to use", + "fa_icon": "fas fa-keyboard" + }, + "features_gtf_table_first_field": { + "type": "string", + "default": "gene_id", + "description": "Where a GTF file is supplied, which field should go first in the converted output table", + "fa_icon": "fas fa-fast-backward" } }, "required": ["features_id_col", "features_name_col", "features_type"], From abce801a3a0ef8f5ae0fb6f6b9a1e455e405c12c Mon Sep 17 00:00:00 2001 From: WackerO Date: Thu, 10 Aug 2023 11:28:42 +0200 Subject: [PATCH 11/30] Updated docs, fixed bugs with proteus integration, separated more clearly the workflow parts necessary for proteus from those not necessary --- assets/differentialabundance_report.Rmd | 63 ++++- conf/modules.config | 4 +- conf/test_maxquant.config | 7 +- docs/usage.md | 18 +- workflows/differentialabundance.nf | 338 +++++++++++++----------- 5 files changed, 261 insertions(+), 169 deletions(-) diff --git a/assets/differentialabundance_report.Rmd b/assets/differentialabundance_report.Rmd index cde17a92..f0184ce9 100644 --- a/assets/differentialabundance_report.Rmd +++ b/assets/differentialabundance_report.Rmd @@ -71,6 +71,7 @@ params: exploratory_mad_threshold: null exploratory_main_variable: null exploratory_assay_names: NULL + exploratory_assay_log2: NULL exploratory_final_assay: NULL exploratory_palette_name: NULL versions_file: null # e.g 17_software_versions.yml @@ -231,21 +232,67 @@ informative_variables <- informative_variables[ ! 
duplicated(lapply(structure(in assay_names <- simpleSplit(params$exploratory_assay_names) names(assay_names) = assay_names assay_files <- lapply(assay_names, function(x) params[[paste0(x, '_matrix')]]) +capture.output(assay_files, file="/home-link/iivow01/git/differentialabundance/error/assay_files") +capture.output(names(assay_files), file="/home-link/iivow01/git/differentialabundance/error/assay_files_names") +capture.output(assay_names, file="/home-link/iivow01/git/differentialabundance/error/assay_names") + +assay_counter <- 0 # Counter to keep track of which assay is currently being processed + +# If set, prepare exploratory_assay_log2 before lapply +if (! is.null(params$exploratory_assay_log2)) { + if (is_valid_positive_integer_vector(params$exploratory_assay_log2)) { + unlogged <- unique(as.integer(simpleSplit(params$exploratory_assay_log2))) + invalid_assays <- unlogged[! unlogged %in% 1:length(assay_files)] + if (length(invalid_assays) > 0){ + stop(paste0("Invalid assay numbers: ", paste(invalid_assays, collapse=', '))) + } + } + + else { + unlogged <- unique(simpleSplit(params$exploratory_assay_log2)) + + # Check if all names are valid + if (any(!(unlogged %in% names(assay_files)))) { + invalid_assays <- paste(unlogged[!(unlogged %in% names(assay_files))], collapse=", ") + stop(paste0(invalid_assays, " is/are not valid assay name(s). Valid assay names are: ", paste(names(assay_files), collapse=", "), ". Please check param --exploratory_assay_log2.")) + } + + for (unlogged_expression_type in unlogged) { + assay_data[[unlogged_expression_type]] <- log2(assay_data[[unlogged_expression_type]]) + } + } +} assay_data <- lapply(assay_files, function(x) { - mat <- read_matrix( - x, - sample_metadata = observations, - row.names = 1 + + mat <- na.omit( + read_matrix( + x, + sample_metadata = observations, + row.names = 1 + ) ) colnames(mat) <- observations[[params$observations_name_col]][match(colnames(mat), rownames(observations))] - # Bit hacky, but ensure log - if (max(mat) > 20){ - log2(mat+1) - }else{ + # Determine if log2 should be guessed/applied to no assays/applied to certain assays + if (is.null(params$exploratory_assay_log2)) { + if (max(mat) > 20){ + log2(mat+1) + } + } else if (params$exploratory_assay_log2 == "") { + mat + } else if (is_valid_positive_integer_vector(params$exploratory_assay_log2)) { + + } + + + + else{ mat } + + + }) # Now we can rename the observations rows using the title field diff --git a/conf/modules.config b/conf/modules.config index 0d77822a..54579e01 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -330,8 +330,8 @@ process { } withName: RMARKDOWNNOTEBOOK { - conda = "bioconda::r-shinyngs=1.7.2" - container = { "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/r-shinyngs:1.7.2--r42hdfd78af_0' : 'quay.io/biocontainers/r-shinyngs:1.7.2--r42hdfd78af_0' }" } + conda = "bioconda::r-shinyngs=1.8.1" + container = { "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/r-shinyngs:1.8.1--r43hdfd78af_0' : 'quay.io/biocontainers/r-shinyngs:1.8.1--r43hdfd78af_0' }" } publishDir = [ path: { "${params.outdir}/report" }, mode: params.publish_dir_mode, diff --git a/conf/test_maxquant.config b/conf/test_maxquant.config index 38eed000..353a51f4 100644 --- a/conf/test_maxquant.config +++ b/conf/test_maxquant.config @@ -24,9 +24,10 @@ params { max_time = '6.h' // Input data - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/proteomics/maxquant/MaxQuant_samplesheet.tsv' - matrix = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/proteomics/maxquant/MaxQuant_proteinGroups.txt' - contrasts = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/proteomics/maxquant/MaxQuant_contrasts.csv' + //TODO + input = '/home-link/iivow01/git/differentialabundance/testdata/maxquant/MaxQuant_samplesheet.tsv' // 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/proteomics/maxquant/MaxQuant_samplesheet.tsv' + matrix = '/home-link/iivow01/git/differentialabundance/testdata/maxquant/MaxQuant_proteinGroups.txt' // 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/proteomics/maxquant/MaxQuant_proteinGroups.txt' + contrasts = '/home-link/iivow01/git/differentialabundance/testdata/maxquant/MaxQuant_contrasts.csv' // 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/proteomics/maxquant/MaxQuant_contrasts.csv' // Features features_id_col = 'Majority protein IDs' diff --git a/docs/usage.md b/docs/usage.md index b97a7fee..a1fe8958 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -11,10 +11,10 @@ Differential analysis is a common task in a variety of use cases. In essence, al With the above in mind, running this workflow requires: - a set of abundance values. This can be: - - (for RNA-seq): a matrix of quantifications with observations by column and features by row + - (for RNA-seq or MaxQuant proteomics measurements): a matrix of quantifications with observations by column and features by row - (for Affymetrix microarrays): a tar'd archive of CEL files - a description of the observations such as a sample sheet from RNA-seq analysis -- a description of the features, for our initial RNA-seq application this can be simply the GTF file from which gene annotations can be derived. For Affymetrix arrays this can be derived from the array platform annotation package automatically. You can also supply your own table. +- a description of the features (skip for MaxQuant), for our initial RNA-seq application this can be simply the GTF file from which gene annotations can be derived. For Affymetrix arrays this can be derived from the array platform annotation package automatically. You can also supply your own table. - a specification of how the matrix should be split, and how the resulting groups should be compared ## Observations (samplesheet) input @@ -49,6 +49,14 @@ The file can be tab or comma separated. This is a numeric square matrix file, comma or tab-separated, with a column for every observation, and features corresponding to the supplied feature set. The parameters `--observations_id_col` and `--features_id_col` define which of the associated fields should be matched in those inputs. +### MaxQuant intensities + +```bash +--matrix '[path to matrix file]' +``` + +This is the proteinGroups.txt file produced by MaxQuant. 
It is a tab-separated matrix file with a column for every observation (plus additional columns for other types of measurements and information); each row contains these data for a set of proteins. The parameters `--observations_id_col` and `--features_id_col` define which of the associated fields should be matched in those inputs. The parameter `--proteus_measurecol_prefix` defines which prefix is used to extract those matrix columns which contain the measurements to be used. For example, the default `LFQ intensity ` will indicate that columns like LFQ intensity S1, LFQ intensity S2, LFQ intensity S3 etc. are used (do not forget trailing whitespace in this parameter, if required!). + ### Affymetrix microarrays ```bash @@ -93,7 +101,7 @@ The file can be tab or comma separated. --gtf '[path to gtf file]' ``` -This is usually the easiest way to supply annotations for RNA-seq features. It should match the GTF used in nf-core/rnaseq if that workflow was used to produce the input expression matrix. +This is usually the easiest way to supply annotations for RNA-seq features. It should match the GTF used in nf-core/rnaseq if that workflow was used to produce the input expression matrix. Skip for MaxQuant. ### Annotation package identifiers for Affymetrix arrays @@ -107,11 +115,11 @@ To override the above options, you may also supply your own features table as a --features '[path to features TSV]' ``` -By default, if you don't provide features, for non-array data the workflow will fall back to attempting to use the matrix itself as a source of feature annotations. For this to work you must make sure to set the `features_id_col`, `features_name_col` and `features_metadata_cols` parameters to the appropriate values, for example by setting them to 'gene_id' if that is the identifier column on the matrix. This will cause the gene ID to be used everywhere rather than more accessible gene symbols (as can be derived from the GTF), but the workflow should run. +By default, if you don't provide features, for non-array data the workflow will fall back to attempting to use the matrix itself as a source of feature annotations. For this to work you must make sure to set the `features_id_col`, `features_name_col` and `features_metadata_cols` parameters to the appropriate values, for example by setting them to 'gene_id' if that is the identifier column on the matrix. This will cause the gene ID to be used everywhere rather than more accessible gene symbols (as can be derived from the GTF), but the workflow should run. Please use this option for MaxQuant analysis. ## Shiny app generation -The pipeline is capable of building, and even deploying (to [shinyapps.io](https://www.shinyapps.io/)) for you a Shiny app built with [ShinyNGS](https://github.com/pinin4fjords/shinyngs). +The pipeline is capable of building, and even deploying (to [shinyapps.io](https://www.shinyapps.io/)) for you a Shiny app built with [ShinyNGS](https://github.com/pinin4fjords/shinyngs) (disabled for MaxQuant). 
This is enabled with: diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf index cc2eb54d..08650b2e 100644 --- a/workflows/differentialabundance.nf +++ b/workflows/differentialabundance.nf @@ -23,6 +23,16 @@ if (params.study_type == 'affy_array'){ error("CEL files archive not specified!") } } else if (params.study_type == 'maxquant') { + + // Should the user have enabled --shinyngs_build_app and/or --gsea_run, throw an error + if (params.shinyngs_build_app) { + error("Cannot build shinyngs app for maxquant data; please set --shinyngs_build_app to false.") + } + if (params.gsea_run) { + error("Cannot run GSEA for maxquant data; please set --gsea_run to false.") + } + + // Make channel for proteus proteus_in = Channel.of([ exp_meta, file(params.input), file(params.matrix) ]) } else { // If this is not an affy array or maxquant output, assume we're reading from a matrix @@ -108,184 +118,216 @@ workflow DIFFERENTIALABUNDANCE { // Set up some basic variables ch_versions = Channel.empty() - // If we have affy array data in the form of CEL files we'll be deriving - // matrix and annotation from them - - if (params.study_type == 'affy_array'){ - - // Uncompress the CEL files archive - - UNTAR ( ch_celfiles ) - - ch_affy_input = ch_input - .join(UNTAR.out.untar) - - // Run affy to derive the matrix. Reset the meta so it can be used to - // define a prefix for different matrix flavours - - AFFY_JUSTRMA_RAW ( - ch_affy_input, - [[],[]] - ) - AFFY_JUSTRMA_NORM ( - ch_affy_input, - [[],[]] - ) - - // Fetch affy outputs and reset the meta - - ch_in_raw = AFFY_JUSTRMA_RAW.out.expression - ch_in_norm = AFFY_JUSTRMA_NORM.out.expression + // Check first if maxquant, as in that case the entire VALIDATE block can be skipped + if (params.study_type == 'maxquant') { - ch_affy_platform_features = AFFY_JUSTRMA_RAW.out.annotation + // Save contrasts file to channel + ch_contrasts_file = Channel.from([[exp_meta, file(params.contrasts)]]) - } else if (params.study_type == 'maxquant') { + // Split contrasts for proteus and for the later modules + ch_contrasts_split = ch_contrasts_file + .splitCsv ( header:true, sep:(params.contrasts.endsWith('tsv') ? 
'\t' : ',')) + .map{ it.tail().first() } - ch_contrasts_proteus = Channel.from(file(params.contrasts)) - .splitCsv(header:true, sep:',') + // For proteus, extract only meta and contrast variable + ch_contrasts_proteus = ch_contrasts_split .map{ tuple( exp_meta, // meta map it.variable // contrast variable ) } - + + // For the plotting and following modules, save all contrast info + ch_contrasts = ch_contrasts_split + .map{ + it.blocking = it.blocking.replace('NA', '') + if (!it.id){ + it.id = it.values().join('_') + } + tuple(it, it.variable, it.reference, it.target) + } + PROTEUS( proteus_in, ch_contrasts_proteus ) - ch_in_raw = PROTEUS.out.raw_tab - ch_in_norm = PROTEUS.out.norm_tab - } + ch_raw = PROTEUS.out.raw_tab + ch_norm = PROTEUS.out.norm_tab + ch_features = PROTEUS.out.norm_tab.map{ + matrix_as_anno_filename = "matrix_as_anno.${it[1].getExtension()}" + it[1].copyTo(matrix_as_anno_filename) // copy normalized outfile to use as fake "annotation" + it[1] = file(matrix_as_anno_filename) + it + } + ch_versions = ch_versions.mix(PROTEUS.out.versions) + + // Filter the input matrix + + CUSTOM_MATRIXFILTER( + ch_norm, + ch_input + ) + + // Prepare inputs for differential processes + + ch_samples_and_matrix = ch_input + .join(CUSTOM_MATRIXFILTER.out.filtered) // -> meta, samplesheet, filtered matrix + .first() - //// Fetch or derive a feature annotation table + ch_processed_matrices = ch_norm + .map{ it.tail() } + .first() + + ch_all_matrices = ch_input // meta, samples + .join(ch_features) // meta, samples, features + .join(ch_raw) // meta, samples, features, raw matrix + .combine(ch_processed_matrices) // meta, samples, features, raw, norm, ... + .map{ + tuple(it[0], it[1], it[2], it[3..it.size()-1]) + } + .first() + + + } else { + + // If we have affy array data in the form of CEL files we'll be deriving + // matrix and annotation from them + if (params.study_type == 'affy_array'){ + + // Uncompress the CEL files archive + + UNTAR ( ch_celfiles ) + + ch_affy_input = ch_input + .join(UNTAR.out.untar) + + // Run affy to derive the matrix. 
Reset the meta so it can be used to + // define a prefix for different matrix flavours + + AFFY_JUSTRMA_RAW ( + ch_affy_input, + [[],[]] + ) + AFFY_JUSTRMA_NORM ( + ch_affy_input, + [[],[]] + ) - // If user has provided a feature annotation table, use that - if (params.features){ - ch_features = Channel.of([ exp_meta, file(params.features, checkIfExists: true)]) - } else if (params.study_type == 'affy_array'){ - ch_features = ch_affy_platform_features - } else if (params.gtf){ - // Get feature annotations from a GTF file, gunzip if necessary + // Fetch affy outputs and reset the meta - file_gtf_in = file(params.gtf) - file_gtf = [ [ "id": file_gtf_in.simpleName ], file_gtf_in ] + ch_in_raw = AFFY_JUSTRMA_RAW.out.expression + ch_in_norm = AFFY_JUSTRMA_NORM.out.expression + + ch_affy_platform_features = AFFY_JUSTRMA_RAW.out.annotation - if ( params.gtf.endsWith('.gz') ){ - GUNZIP_GTF(file_gtf) - file_gtf = GUNZIP_GTF.out.gunzip - ch_versions = ch_versions.mix(GUNZIP_GTF.out.versions) } - // Get a features table from the GTF and combine with the matrix and sample - // annotation (fom = features/ observations/ matrix) + //// Fetch or derive a feature annotation table - GTF_TO_TABLE( file_gtf, [[ "id":""], []]) - ch_features = GTF_TO_TABLE.out.feature_annotation - .map{ - tuple( exp_meta, it[1]) - } + // If user has provided a feature annotation table, use that + if (params.features){ + ch_features = Channel.of([ exp_meta, file(params.features, checkIfExists: true)]) + } else if (params.study_type == 'affy_array'){ + ch_features = ch_affy_platform_features + } else if (params.gtf){ + // Get feature annotations from a GTF file, gunzip if necessary - // Record the version of the GTF -> table tool + file_gtf_in = file(params.gtf) + file_gtf = [ [ "id": file_gtf_in.simpleName ], file_gtf_in ] - ch_versions = ch_versions - .mix(GTF_TO_TABLE.out.versions) - } - else{ - if (params.study_type == 'maxquant'){ - ch_features = PROTEUS.out.norm_tab.map{ - matrix_as_anno_filename = "matrix_as_anno.${it[1].getExtension()}" - it[1].copyTo(matrix_as_anno_filename) - it[1] = file(matrix_as_anno_filename) - it + if ( params.gtf.endsWith('.gz') ){ + GUNZIP_GTF(file_gtf) + file_gtf = GUNZIP_GTF.out.gunzip + ch_versions = ch_versions.mix(GUNZIP_GTF.out.versions) } + + // Get a features table from the GTF and combine with the matrix and sample + // annotation (fom = features/ observations/ matrix) + + GTF_TO_TABLE( file_gtf, [[ "id":""], []]) + ch_features = GTF_TO_TABLE.out.feature_annotation + .map{ + tuple( exp_meta, it[1]) + } + + // Record the version of the GTF -> table tool + + ch_versions = ch_versions + .mix(GTF_TO_TABLE.out.versions) + } else { - } - else { // Otherwise we can just use the matrix input - matrix_as_anno_filename = "matrix_as_anno.${matrix_file.getExtension()}" - matrix_file.copyTo(matrix_as_anno_filename) - ch_features = Channel.of([ exp_meta, file(matrix_as_anno_filename)]) + matrix_as_anno_filename = "matrix_as_anno.${matrix_file.getExtension()}" + matrix_file.copyTo(matrix_as_anno_filename) + ch_features = Channel.of([ exp_meta, file(matrix_as_anno_filename)]) } - } - // Channel for the contrasts file + // Channel for the contrasts file - ch_contrasts_file = Channel.from([[exp_meta, file(params.contrasts)]]) + ch_contrasts_file = Channel.from([[exp_meta, file(params.contrasts)]]) - // Check compatibility of FOM elements and contrasts + // Check compatibility of FOM elements and contrasts - if (params.study_type == 'affy_array'){ - ch_matrices_for_validation = ch_in_raw - 
.join(ch_in_norm) - .map{tuple(it[0], [it[1], it[2]])} - } else if (params.study_type == 'maxquant') { - ch_matrices_for_validation = ch_in_raw - .join(ch_in_norm) - .map{tuple(it[0], [it[1], it[2]])} - } - else{ - ch_matrices_for_validation = ch_in_raw - } - - VALIDATOR( - ch_input.join(ch_matrices_for_validation), - ch_features, - ch_contrasts_file - ) - - // For Affy, we've validated multiple input matrices for raw and norm, - // we'll separate them out again here + if (params.study_type == 'affy_array'){ + ch_matrices_for_validation = ch_in_raw + .join(ch_in_norm) + .map{tuple(it[0], [it[1], it[2]])} + } else { + ch_matrices_for_validation = ch_in_raw + } - if (params.study_type == 'affy_array' || params.study_type == 'maxquant'){ - ch_validated_assays = VALIDATOR.out.assays - .transpose() - .branch { - raw: it[1].name.contains('raw') - normalised: it[1].name.contains('normalised') || it[1].name.contains('normalized') - } - ch_raw = ch_validated_assays.raw - ch_norm = ch_validated_assays.normalised - ch_matrix_for_differential = ch_norm + VALIDATOR( + ch_input.join(ch_matrices_for_validation), + ch_features, + ch_contrasts_file + ) - } else{ - ch_raw = VALIDATOR.out.assays - ch_matrix_for_differential = ch_raw - } + // For Affy, we've validated multiple input matrices for raw and norm, + // we'll separate them out again here + + if (params.study_type == 'affy_array'){ + ch_validated_assays = VALIDATOR.out.assays + .transpose() + .branch { + raw: it[1].name.contains('raw') + normalised: it[1].name.contains('normalised') + } + ch_raw = ch_validated_assays.raw + ch_norm = ch_validated_assays.normalised + ch_matrix_for_differential = ch_norm + + } else { + ch_raw = VALIDATOR.out.assays + ch_matrix_for_differential = ch_raw + } - // Split the contrasts up so we can run differential analyses and - // downstream plots separately. - // Replace NA strings that might have snuck into the blocking column + // Split the contrasts up so we can run differential analyses and + // downstream plots separately. 
+ // Replace NA strings that might have snuck into the blocking column - ch_contrasts = VALIDATOR.out.contrasts - .map{it[1]} - .splitCsv ( header:true, sep:'\t' ) - .map{ - it.blocking = it.blocking.replace('NA', '') - if (!it.id){ - it.id = it.values().join('_') + ch_contrasts = VALIDATOR.out.contrasts + .map{it[1]} + .splitCsv ( header:true, sep:'\t' ) + .map{ + it.blocking = it.blocking.replace('NA', '') + if (!it.id){ + it.id = it.values().join('_') + } + tuple(it, it.variable, it.reference, it.target) } - tuple(it, it.variable, it.reference, it.target) - } - if (params.study_type == 'maxquant') { - ch_samples_and_matrix = VALIDATOR.out.sample_meta - .join(ch_matrix_for_differential) // -> meta, samplesheet, unfiltered matrix - .first() + // Firstly Filter the input matrix - } else { - // Firstly Filter the input matrix - ch_matrix_for_differential - VALIDATOR.out.sample_meta - CUSTOM_MATRIXFILTER( - ch_matrix_for_differential, - VALIDATOR.out.sample_meta - ) + CUSTOM_MATRIXFILTER( + ch_matrix_for_differential, + VALIDATOR.out.sample_meta + ) - // Prepare inputs for differential processes - ch_samples_and_matrix = VALIDATOR.out.sample_meta - .join(CUSTOM_MATRIXFILTER.out.filtered) // -> meta, samplesheet, filtered matrix - .first() + // Prepare inputs for differential processes + ch_samples_and_matrix = VALIDATOR.out.sample_meta + .join(CUSTOM_MATRIXFILTER.out.filtered) // -> meta, samplesheet, filtered matrix + .first() } if (params.study_type == 'affy_array' || params.study_type == 'maxquant'){ @@ -399,15 +441,6 @@ workflow DIFFERENTIALABUNDANCE { } .unique() - ch_all_matrices = VALIDATOR.out.sample_meta // meta, samples - .join(VALIDATOR.out.feature_meta) // meta, samples, features - .join(ch_raw) // meta, samples, features, raw matrix - .combine(ch_processed_matrices) // meta, samples, features, raw, norm, ... 
- .map{ - tuple(it[0], it[1], it[2], it[3..it.size()-1]) - } - .first() - ch_contrast_variables .combine(ch_all_matrices.map{ it.tail() }) @@ -429,9 +462,12 @@ workflow DIFFERENTIALABUNDANCE { // Gather software versions ch_versions = ch_versions - .mix(VALIDATOR.out.versions) .mix(PLOT_EXPLORATORY.out.versions) .mix(PLOT_DIFFERENTIAL.out.versions) + if (params.study_type != 'maxquant') { + ch_versions = ch_versions + .mix(VALIDATOR.out.versions) + } CUSTOM_DUMPSOFTWAREVERSIONS ( ch_versions.unique().collectFile(name: 'collated_versions.yml') @@ -509,7 +545,7 @@ workflow DIFFERENTIALABUNDANCE { } // Render the final report - ch_report_params + RMARKDOWNNOTEBOOK( ch_report_file, ch_report_params, From 4b8d5bc71c21c1dc55da0d2200bfb25be3769d38 Mon Sep 17 00:00:00 2001 From: WackerO Date: Wed, 23 Aug 2023 11:35:42 +0200 Subject: [PATCH 12/30] Changed list format of --features_log2_assays --- assets/differentialabundance_report.Rmd | 80 ++++++------------- conf/modules.config | 2 +- conf/test_maxquant.config | 8 +- .../shinyngs/staticexploratory/main.nf | 4 +- nextflow.config | 1 + nextflow_schema.json | 5 ++ workflows/differentialabundance.nf | 6 +- 7 files changed, 38 insertions(+), 68 deletions(-) diff --git a/assets/differentialabundance_report.Rmd b/assets/differentialabundance_report.Rmd index a92c97ff..cb711dc7 100644 --- a/assets/differentialabundance_report.Rmd +++ b/assets/differentialabundance_report.Rmd @@ -148,28 +148,9 @@ datatable(NULL) ``` ```{r, include=FALSE} -custom_num_handler <- function(i) { - tryCatch( - { - as.numeric(i) - # write(paste0("Tying to convert argument", i, "to numeric."), file="/home-link/iivow01/git/differentialabundance/error/trystuff", append=T) - - }, - error=function(e) { - write(paste0("Could not convert argument", i, "to numeric because of the following error\n", e, "\nWill treat as character."), file="/home-link/iivow01/git/differentialabundance/error/errorstuff", append=T) - as.character(i) - }, - warning=function(w) { - print(paste0("Could not convert argument", i, "to numeric because of the following warning\n", w, "\nWill treat as character.")) - write(paste0("Could not convert argument", i, "to numeric because of the following error\n", e, "\nWill treat as character."), file="/home-link/iivow01/git/differentialabundance/error/warnstuff", append=T) - as.character(i) - } - ) -} - -versions <- unlist(yaml.load_file(file.path(params$input_dir, params$versions_file), handlers=list("float#fix"=custom_num_handler)), recursive = FALSE) +versions <- unlist(yaml.load_file(file.path(params$input_dir, params$versions_file)), recursive = FALSE) params_table <- data.frame(Parameter = names(unlist(params)), Value = unlist(params), row.names = NULL) -write.table(params_table, file="/home-link/iivow01/git/differentialabundance/error/pars.tsv", sep="\t", quote=F) + # We'll subset the params table for different report sections make_params_table <- function(name, pattern = NULL, remove_pattern = FALSE){ subparams <- params_table @@ -250,26 +231,10 @@ informative_variables <- informative_variables[ ! 
duplicated(lapply(structure(in assay_names <- simpleSplit(params$exploratory_assay_names) names(assay_names) = assay_names assay_files <- lapply(assay_names, function(x) params[[paste0(x, '_matrix')]]) -write(length(assay_files), file="/home-link/iivow01/git/differentialabundance/error/assay_length") # Set up vector of unlogged assay files (if any) unlogged <- c() -features_log2_assays <- gsub('\\]$', '', gsub('^\\[', '', params$features_log2_assays)) -if (params$study_type == "maxquant") { - # Overwrite param for maxquant as the proteus module always logs - # features_log2_assays <- "0" -} - - - - -capture.output(params$limma_stdev_coef_lim, file="/home-link/iivow01/git/differentialabundance/error/aaaaa") -capture.output(typeof(params$features_log2_assays), file="/home-link/iivow01/git/differentialabundance/error/aaaaa", append=T) -capture.output(params$features_log2_assays, file="/home-link/iivow01/git/differentialabundance/error/aaaaa", append=T) -capture.output(typeof(params$features_log2_assays), file="/home-link/iivow01/git/differentialabundance/error/aaaaa", append=T) -capture.output(features_log2_assays, file="/home-link/iivow01/git/differentialabundance/error/aaaaa", append=T) -capture.output(typeof(features_log2_assays), file="/home-link/iivow01/git/differentialabundance/error/aaaaa", append=T) -capture.output(params, file="/home-link/iivow01/git/differentialabundance/error/aaaaa", append=T) +features_log2_assays <- params$features_log2_assays if (is.null(features_log2_assays)) { # Guess unlogged assays @@ -279,28 +244,31 @@ if (is.null(features_log2_assays)) { } } -} else if (features_log2_assays == "") { - # Do nothing as no assay files are unlogged -capture.output("Do nothing as no assay files are unlogged", file="/home-link/iivow01/git/differentialabundance/error/aaaaa", append=T) +} else { + features_log2_assays <- gsub('\\]$', '', gsub('^\\[', '', features_log2_assays)) # Remove brackets from assay list -} else if (is_valid_positive_integer_vector(features_log2_assays)) { - # Get assay names at indicated positions - unlogged <- unique(as.integer(simpleSplit(features_log2_assays))) - invalid_assays <- unlogged[! unlogged %in% 1:length(assay_files)] - if (length(invalid_assays) > 0){ - stop(paste0("Invalid assay numbers: ", paste(invalid_assays, collapse=', '))) - } + if (features_log2_assays == "") { + # Do nothing as no assay files are unlogged -} else { - # Last option is string of assay names, so just split it into a list - unlogged <- unique(simpleSplit(features_log2_assays)) - # Check if all names are valid - invalid_assays <- paste(unlogged[!(unlogged %in% names(assay_files))], collapse=", ") - if (length(invalid_assays) > 0) { - stop(paste0(invalid_assays, " is/are not valid assay name(s). Valid assay names are: ", paste(names(assay_files), collapse=", "), ". Please check param --features_log2_assays.")) + } else if (is_valid_positive_integer_vector(features_log2_assays)) { + # Get assay names at indicated positions + unlogged <- unique(as.integer(simpleSplit(features_log2_assays))) + invalid_assays <- unlogged[! 
unlogged %in% 1:length(assay_files)] + if (length(invalid_assays) > 0){ + stop(paste0("Invalid assay numbers: ", paste(invalid_assays, collapse=', '))) + } + + } else { + # Last option is string of assay names, so just split it into a list + unlogged <- unique(simpleSplit(features_log2_assays)) + # Check if all names are valid + invalid_assays <- paste(unlogged[!(unlogged %in% names(assay_files))], collapse=", ") + if (length(invalid_assays) > 0) { + stop(paste0(invalid_assays, " is/are not valid assay name(s). Valid assay names are: ", paste(names(assay_files), collapse=", "), ". Please check param --features_log2_assays.")) + } } } -asdasd + assay_data <- lapply(names(assay_files), function(x) { mat <- na.omit( read_matrix( diff --git a/conf/modules.config b/conf/modules.config index fcde245a..eaf82ddb 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -299,7 +299,7 @@ process { "--final_assay \"${params.exploratory_final_assay}\"", "--outlier_mad_threshold ${params.exploratory_mad_threshold}", "--palette_name \"${params.exploratory_palette_name}\"", - ( ((params.features_log2_assays == null) ? '' : "--log2_assays \"$params.features_log2_assays\"")) + ( ((params.features_log2_assays == null) ? '' : "--log2_assays \"$params.features_log2_assays\"") ).replace('[', '').replace(']', '') ].join(' ').trim() } } diff --git a/conf/test_maxquant.config b/conf/test_maxquant.config index 353a51f4..38315b0e 100644 --- a/conf/test_maxquant.config +++ b/conf/test_maxquant.config @@ -24,14 +24,14 @@ params { max_time = '6.h' // Input data - //TODO - input = '/home-link/iivow01/git/differentialabundance/testdata/maxquant/MaxQuant_samplesheet.tsv' // 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/proteomics/maxquant/MaxQuant_samplesheet.tsv' - matrix = '/home-link/iivow01/git/differentialabundance/testdata/maxquant/MaxQuant_proteinGroups.txt' // 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/proteomics/maxquant/MaxQuant_proteinGroups.txt' - contrasts = '/home-link/iivow01/git/differentialabundance/testdata/maxquant/MaxQuant_contrasts.csv' // 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/proteomics/maxquant/MaxQuant_contrasts.csv' + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/proteomics/maxquant/MaxQuant_samplesheet.tsv' + matrix = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/proteomics/maxquant/MaxQuant_proteinGroups.txt' + contrasts = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/proteomics/maxquant/MaxQuant_contrasts.csv' // Features features_id_col = 'Majority protein IDs' features_metadata_cols = "Majority protein IDs" + features_log2_assays = "[]" // Observations observations_id_col = 'Experiment' diff --git a/modules/nf-core/shinyngs/staticexploratory/main.nf b/modules/nf-core/shinyngs/staticexploratory/main.nf index f9742f2a..2c351949 100644 --- a/modules/nf-core/shinyngs/staticexploratory/main.nf +++ b/modules/nf-core/shinyngs/staticexploratory/main.nf @@ -33,9 +33,7 @@ process SHINYNGS_STATICEXPLORATORY { def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: meta.id """ - echo $args > /home-link/iivow01/git/differentialabundance/error/explo_params - #exploratory_plots.R \\ - /home-link/iivow01/git/differentialabundance/save_stuff/templates/explo_before_param_change.R \\ + exploratory_plots.R \\ --sample_metadata "$sample" \\ --feature_metadata "$feature_meta" \\ --assay_files "${assay_files.join(',')}" \\ diff --git 
a/nextflow.config b/nextflow.config index 2e59d297..959e9f67 100644 --- a/nextflow.config +++ b/nextflow.config @@ -39,6 +39,7 @@ params { features_id_col = 'gene_id' features_name_col = 'gene_name' features_metadata_cols = 'gene_id,gene_name,gene_biotype' + features_log2_assays = null // GTF parsing options features_gtf_feature_type = 'transcript' diff --git a/nextflow_schema.json b/nextflow_schema.json index 127c797b..6ae509da 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -178,6 +178,11 @@ "default": "gene_id", "description": "Where a GTF file is supplied, which field should go first in the converted output table", "fa_icon": "fas fa-fast-backward" + }, + "features_log2_assays": { + "type": "string", + "description": "Of which assays to compute the log2. Not necessary for maxquant data as this is controlled by the pipeline.", + "help_text": "Either comma-separated list of assay names enclosed by square brackets, e.g. '[raw,normalised]'; or list of assay positions, e.g. '[1,2,3]', or empty list '[]' to not log any assay. If not set, will guess which assays need to be logged (those with a maximum > 20)." } }, "required": ["features_id_col", "features_name_col", "features_type"], diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf index 8ae45669..448cd4fd 100644 --- a/workflows/differentialabundance.nf +++ b/workflows/differentialabundance.nf @@ -564,9 +564,7 @@ workflow DIFFERENTIALABUNDANCE { if (params.study_type == 'affy_array' || params.study_type == 'maxquant'){ params_pattern = ~/^(report|study|observations|features|filtering|exploratory|differential|affy|limma|gsea).*/ } - print "params" - print params - print "params end" + ch_report_params = ch_report_input_files .map{ params.findAll{ k,v -> k.matches(params_pattern) } + @@ -577,7 +575,7 @@ workflow DIFFERENTIALABUNDANCE { RMARKDOWNNOTEBOOK( ch_report_file, - ch_report_params.dump(tag:'params'), + ch_report_params, ch_report_input_files ) From c844121d716a2c879e282f6b57e03dac62bc69e6 Mon Sep 17 00:00:00 2001 From: WackerO Date: Thu, 24 Aug 2023 11:30:07 +0200 Subject: [PATCH 13/30] Some fixes of log2_assays --- assets/differentialabundance_report.Rmd | 21 ++++++++++++++++----- conf/modules.config | 2 +- conf/test_maxquant.config | 3 +-- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/assets/differentialabundance_report.Rmd b/assets/differentialabundance_report.Rmd index cb711dc7..e5c97cc5 100644 --- a/assets/differentialabundance_report.Rmd +++ b/assets/differentialabundance_report.Rmd @@ -236,9 +236,17 @@ assay_files <- lapply(assay_names, function(x) params[[paste0(x, '_matrix')]]) unlogged <- c() features_log2_assays <- params$features_log2_assays +# For maxquant input override param as all assays are already logged by proteus +if (params$study_type == 'maxquant') { + features_log2_assays <- "" + if (! 
is.null(features_log2_assays)) { + print("Got maxquant dataset, will override --features_log2_assays parameter so as not to log the assays twice.") + } +} + if (is.null(features_log2_assays)) { # Guess unlogged assays - for (assay in names(assay_files)) { + for (assay in c(1:length(assay_files))) { if (max(assay_files[[assay]]) > 20) { unlogged <- append(unlogged, assay) } @@ -251,25 +259,28 @@ if (is.null(features_log2_assays)) { # Do nothing as no assay files are unlogged } else if (is_valid_positive_integer_vector(features_log2_assays)) { - # Get assay names at indicated positions + # Convert to list of assay positions unlogged <- unique(as.integer(simpleSplit(features_log2_assays))) invalid_assays <- unlogged[! unlogged %in% 1:length(assay_files)] + if (length(invalid_assays) > 0){ stop(paste0("Invalid assay numbers: ", paste(invalid_assays, collapse=', '))) } } else { - # Last option is string of assay names, so just split it into a list + # Last option is string of assay names, so get positions of those names in the assay list unlogged <- unique(simpleSplit(features_log2_assays)) + # Check if all names are valid - invalid_assays <- paste(unlogged[!(unlogged %in% names(assay_files))], collapse=", ") + invalid_assays <- unlogged[!(unlogged %in% names(assay_files))] if (length(invalid_assays) > 0) { stop(paste0(invalid_assays, " is/are not valid assay name(s). Valid assay names are: ", paste(names(assay_files), collapse=", "), ". Please check param --features_log2_assays.")) } + unlogged <- match(unlogged, names(assay_files)) } } -assay_data <- lapply(names(assay_files), function(x) { +assay_data <- lapply(c(1:length(assay_files)), function(x) { mat <- na.omit( read_matrix( assay_files[[x]], diff --git a/conf/modules.config b/conf/modules.config index eaf82ddb..cd7ab99b 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -299,7 +299,7 @@ process { "--final_assay \"${params.exploratory_final_assay}\"", "--outlier_mad_threshold ${params.exploratory_mad_threshold}", "--palette_name \"${params.exploratory_palette_name}\"", - ( ((params.features_log2_assays == null) ? '' : "--log2_assays \"$params.features_log2_assays\"") ).replace('[', '').replace(']', '') + ( (params.study_type == 'maxquant') ? "--log2_assays ''" : (((params.features_log2_assays == null) ? 
'' : "--log2_assays \"$params.features_log2_assays\"".replace('[', '').replace(']', ''))) ) ].join(' ').trim() } } diff --git a/conf/test_maxquant.config b/conf/test_maxquant.config index 38315b0e..4841f5e5 100644 --- a/conf/test_maxquant.config +++ b/conf/test_maxquant.config @@ -31,12 +31,11 @@ params { // Features features_id_col = 'Majority protein IDs' features_metadata_cols = "Majority protein IDs" - features_log2_assays = "[]" // Observations observations_id_col = 'Experiment' observations_name_col = 'Name' - maxquant_measurecol_prefix = 'LFQ intensity ' + proteus_measurecol_prefix = 'LFQ intensity ' // Exploratory exploratory_main_variable = 'Celltype' From 41e8cda8d1a5be03f44e15cdbd8ea5b8e9b0b183 Mon Sep 17 00:00:00 2001 From: WackerO Date: Thu, 24 Aug 2023 11:34:25 +0200 Subject: [PATCH 14/30] Removed process def from test_maxquant.config as it is not anymore necessary --- conf/test_maxquant.config | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/conf/test_maxquant.config b/conf/test_maxquant.config index 4841f5e5..55fa8690 100644 --- a/conf/test_maxquant.config +++ b/conf/test_maxquant.config @@ -47,13 +47,3 @@ params { differential_fc_column = 'logFC' differential_qval_column = 'adj.P.Val' } - -// This is necessary so that some of the parameters changed above are actually accepted by the pipeline (unless using -params-file test_maxquant.yml) -process { - withName: VALIDATOR { - publishDir = [ - enabled: false - ] - ext.args = "--sample_id_col '${params.observations_id_col}' --feature_id_col '${params.features_id_col}'" - } -} From b71d3ea75883ffd2e1c5a2baad498b0a6fa596ef Mon Sep 17 00:00:00 2001 From: WackerO Date: Thu, 24 Aug 2023 14:35:41 +0200 Subject: [PATCH 15/30] More cleanup, comment/docu changes --- assets/differentialabundance_report.Rmd | 6 +++++- conf/modules.config | 1 - docs/usage.md | 2 +- nextflow_schema.json | 4 ++-- workflows/differentialabundance.nf | 2 +- 5 files changed, 9 insertions(+), 6 deletions(-) diff --git a/assets/differentialabundance_report.Rmd b/assets/differentialabundance_report.Rmd index e5c97cc5..9496a2f1 100644 --- a/assets/differentialabundance_report.Rmd +++ b/assets/differentialabundance_report.Rmd @@ -228,6 +228,7 @@ informative_variables <- unique(c(contrasts$variable, chooseGroupingVariables(ob # Remove any informative variables that group observations the same way informative_variables <- informative_variables[ ! 
duplicated(lapply(structure(informative_variables, names= informative_variables), function(x) as.numeric(factor(observations[[x]], levels=unique(observations[[x]])))))] + assay_names <- simpleSplit(params$exploratory_assay_names) names(assay_names) = assay_names assay_files <- lapply(assay_names, function(x) params[[paste0(x, '_matrix')]]) @@ -297,6 +298,8 @@ assay_data <- lapply(c(1:length(assay_files)), function(x) { mat }) + +# Lapply does not go over the assay_files themselves anymore, so we need to specifically assign their names to the data names(assay_data) <- names(assay_files) # Now we can rename the observations rows using the title field @@ -308,6 +311,7 @@ pca_datas <- lapply(names(assay_data), function(assay_type){ compilePCAData(assay_data[[assay_type]]) }) names(pca_datas) <- names(assay_data) + pca_vs_meta <- anova_pca_metadata(pca_datas[[params$exploratory_final_assay]]$coords, observations[,informative_variables, drop = FALSE], pca_datas[[params$exploratory_final_assay]]$percentVar) # Show the variable with the tightest PC associations first @@ -460,6 +464,7 @@ if (all(minimal_fetchngs_cols %in% colnames(observations))){ }else{ additional_useful_cols <- colnames(observations)[which(apply(observations, 2, function(x) max(nchar(x))) <= 20)] } + display_columns <- head(union(display_columns, additional_useful_cols), 5) # Also add informative columns @@ -640,7 +645,6 @@ for (assay_type in rev(names(assay_data))){ variable_genes <- selectVariableGenes(matrix = assay_data[[assay_type]], ntop = params$exploratory_n_features) dendroColorScale <- makeColorScale(length(unique(observations[[iv]])), palette = params$exploratory_palette_name) - p <- clusteringDendrogram( 2^assay_data[[assay_type]][variable_genes, ], observations[, iv, drop = FALSE], diff --git a/conf/modules.config b/conf/modules.config index cd7ab99b..d045f2a4 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -35,7 +35,6 @@ process { ext.args = "--feature-type '${params.features_gtf_feature_type}' --first-field '${params.features_gtf_table_first_field}'" } - withName: VALIDATOR { publishDir = [ enabled: false diff --git a/docs/usage.md b/docs/usage.md index bb1bda4d..5718d507 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -115,7 +115,7 @@ To override the above options, you may also supply your own features table as a --features '[path to features TSV]' ``` -By default, if you don't provide features, for non-array data the workflow will fall back to attempting to use the matrix itself as a source of feature annotations. For this to work you must make sure to set the `features_id_col`, `features_name_col` and `features_metadata_cols` parameters to the appropriate values, for example by setting them to 'gene_id' if that is the identifier column on the matrix. This will cause the gene ID to be used everywhere rather than more accessible gene symbols (as can be derived from the GTF), but the workflow should run. Please use this option for MaxQuant analysis. +By default, if you don't provide features, for non-array data the workflow will fall back to attempting to use the matrix itself as a source of feature annotations. For this to work you must make sure to set the `features_id_col`, `features_name_col` and `features_metadata_cols` parameters to the appropriate values, for example by setting them to 'gene_id' if that is the identifier column on the matrix. 
This will cause the gene ID to be used everywhere rather than more accessible gene symbols (as can be derived from the GTF), but the workflow should run. Please use this option for MaxQuant analysis, i.e. do not provide features.
 
 ## Shiny app generation
 
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 6ae509da..0f23c916 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -182,7 +182,7 @@
         "features_log2_assays": {
             "type": "string",
             "description": "Of which assays to compute the log2. Not necessary for maxquant data as this is controlled by the pipeline.",
-            "help_text": "Either comma-separated list of assay names enclosed by square brackets, e.g. '[raw,normalised]'; or list of assay positions, e.g. '[1,2,3]', or empty list '[]' to not log any assay. If not set, will guess which assays need to be logged (those with a maximum > 20)."
+            "help_text": "Either a comma-separated list of assay positions, e.g. '[1,2,3]', or an empty list '[]' to not log any assay. If not set, will guess which assays need to be logged (those with a maximum > 20)."
         }
     },
     "required": ["features_id_col", "features_name_col", "features_type"],
@@ -268,7 +268,7 @@
         "proteus_plotSD_method": {
             "type": "string",
             "default": "violin",
-            "description": "Which method to use for plotting sample distributions of the MaxQuant intensities.",
+            "description": "Which method to use for plotting sample distributions of the MaxQuant intensities; one of 'violin', 'dist', 'box'.",
             "help_text": "'violin', 'dist' or 'box'",
             "enum": ["violin", "dist", "box"]
         },
diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf
index 448cd4fd..f5546fe8 100644
--- a/workflows/differentialabundance.nf
+++ b/workflows/differentialabundance.nf
@@ -570,7 +570,7 @@ workflow DIFFERENTIALABUNDANCE {
             params.findAll{ k,v -> k.matches(params_pattern) } +
             [report_file_names, it.collect{ f -> f.name}].transpose().collectEntries()
         }
-
+
     // Render the final report
 
     RMARKDOWNNOTEBOOK(

From a6f723f4e1969069a7467a9cdd21bc3d4c0b8b4e Mon Sep 17 00:00:00 2001
From: WackerO
Date: Thu, 24 Aug 2023 15:22:27 +0200
Subject: [PATCH 16/30] Updated output doc

---
 docs/output.md | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/docs/output.md b/docs/output.md
index f93e90be..7f3ba2ad 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -37,6 +37,12 @@ Stand-alone graphical outputs are placed in this directory. They may be useful i
   - `[contrast]/png/volcano.png`: Volcano plots of -log(10) p value agains log(2) fold changes
 - `gsea/`: Directory containing graphical outputs from GSEA (where enabled). Plots are stored in directories named for the associated contrast.
   - `[contrast]/png/[gsea_plot_type].png`
+- `proteus/`: If `--study_type maxquant`: Directory containing plots produced by the proteus module which is used for processing MaxQuant input. Files are prefixed with the associated contrast and chosen normalization function (if any).
+  - `[contrast].proteus.[normfun].normalized_dendrogram.png`: A sample clustering dendrogram after normalization, if chosen.
+  - `[contrast].proteus.[normfun].normalized_mean_variance_relationship.png`: Plots of log intensity vs mean log intensity after normalization of each contrast level, if chosen.
+  - `[contrast].proteus.[normfun].normalized_distributions.png`: A plot of sample distributions after normalization, if chosen.
+  - `[contrast].proteus.raw_distributions.png`: A plot of sample distributions without normalization.
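The proteus plots listed above are generated by the readproteingroups R template introduced later in this series. A minimal sketch of how such outputs arise, assuming the proteus functions used in that template (the file and sample names here are placeholders, not pipeline defaults):

```r
# Sketch only, not the shipped module code.
library(proteus)

meta <- read.csv("samplesheet.csv")   # placeholder; must contain 'sample' and 'condition' columns
proteinGroups <- readProteinGroups(
    file = "proteinGroups.txt",       # placeholder MaxQuant proteinGroups table
    meta = meta,
    measure.cols = setNames(paste0("LFQ intensity ", meta$sample), meta$sample)
)

# Normalisation chosen via --proteus_norm_function
proteinGroups.normalized <- normalizeData(proteinGroups, norm.fun = normalizeMedian)

# These calls correspond to the distribution, mean-variance and dendrogram PNGs described above
plotSampleDistributions(proteinGroups.normalized, fill = "condition", method = "violin")
plotMV(proteinGroups.normalized, with.loess = TRUE)
plotClustering(proteinGroups.normalized)
```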
@@ -61,6 +67,9 @@ Most plots are included in the HTML report (see above), but are also included in - `OR [contrast_name].limma.results.tsv`: Results of Limma differential analyis (Affymetrix arrays) - `gsea/`: Directory containing tables of differential gene set analyis from GSEA (where enabled) - `[contrast]/[contrast].gsea_report_for_[condition].tsv`: A GSEA report table for each side of each contrast + - `proteus/`: If `--study_type maxquant`: Directory containing abundance values produced by the proteus module which is used for processing MaxQuant input. Files are prefixed with the associated contrast and chosen normalization function (if any). + - `[contrast].proteus.[normfun].normalized_proteingroups_tab.tsv`: Abundance table after normalization, if chosen. + - `[contrast].proteus.raw_proteingroups_tab.tsv`: Abundance table without normalization. From 3078c8a0f6e922d334e32cc8f34d85c036a84863 Mon Sep 17 00:00:00 2001 From: WackerO Date: Thu, 24 Aug 2023 15:40:13 +0200 Subject: [PATCH 17/30] Fixed NULL being printed in the report.html --- assets/differentialabundance_report.Rmd | 40 ++++++++++++------------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/assets/differentialabundance_report.Rmd b/assets/differentialabundance_report.Rmd index 9496a2f1..714b046b 100644 --- a/assets/differentialabundance_report.Rmd +++ b/assets/differentialabundance_report.Rmd @@ -240,9 +240,6 @@ features_log2_assays <- params$features_log2_assays # For maxquant input override param as all assays are already logged by proteus if (params$study_type == 'maxquant') { features_log2_assays <- "" - if (! is.null(features_log2_assays)) { - print("Got maxquant dataset, will override --features_log2_assays parameter so as not to log the assays twice.") - } } if (is.null(features_log2_assays)) { @@ -256,28 +253,29 @@ if (is.null(features_log2_assays)) { } else { features_log2_assays <- gsub('\\]$', '', gsub('^\\[', '', features_log2_assays)) # Remove brackets from assay list - if (features_log2_assays == "") { - # Do nothing as no assay files are unlogged + if (features_log2_assays != "") { - } else if (is_valid_positive_integer_vector(features_log2_assays)) { - # Convert to list of assay positions - unlogged <- unique(as.integer(simpleSplit(features_log2_assays))) - invalid_assays <- unlogged[! unlogged %in% 1:length(assay_files)] + if (is_valid_positive_integer_vector(features_log2_assays)) { + # Convert to list of assay positions + unlogged <- unique(as.integer(simpleSplit(features_log2_assays))) + invalid_assays <- unlogged[! unlogged %in% 1:length(assay_files)] - if (length(invalid_assays) > 0){ - stop(paste0("Invalid assay numbers: ", paste(invalid_assays, collapse=', '))) - } + if (length(invalid_assays) > 0){ + stop(paste0("Invalid assay numbers: ", paste(invalid_assays, collapse=', '))) + } - } else { - # Last option is string of assay names, so get positions of those names in the assay list - unlogged <- unique(simpleSplit(features_log2_assays)) + } else { + # Last option is string of assay names, so get positions of those names in the assay list + unlogged <- unique(simpleSplit(features_log2_assays)) - # Check if all names are valid - invalid_assays <- unlogged[!(unlogged %in% names(assay_files))] - if (length(invalid_assays) > 0) { - stop(paste0(invalid_assays, " is/are not valid assay name(s). Valid assay names are: ", paste(names(assay_files), collapse=", "), ". 
Please check param --features_log2_assays.")) + # Check if all names are valid + invalid_assays <- unlogged[!(unlogged %in% names(assay_files))] + if (length(invalid_assays) > 0) { + stop(paste0(invalid_assays, " is/are not valid assay name(s). Valid assay names are: ", paste(names(assay_files), collapse=", "), ". Please check param --features_log2_assays.")) + } + unlogged <- match(unlogged, names(assay_files)) } - unlogged <- match(unlogged, names(assay_files)) + } } @@ -373,7 +371,6 @@ differential_results <- lapply(differential_files, function(diff_file){ } # Annotate differential tables if possible - if (! is.null(params$features)){ diff <- merge(features, diff, by.x = params$features_id_col, by.y = params$differential_feature_id_column) } @@ -645,6 +642,7 @@ for (assay_type in rev(names(assay_data))){ variable_genes <- selectVariableGenes(matrix = assay_data[[assay_type]], ntop = params$exploratory_n_features) dendroColorScale <- makeColorScale(length(unique(observations[[iv]])), palette = params$exploratory_palette_name) + p <- clusteringDendrogram( 2^assay_data[[assay_type]][variable_genes, ], observations[, iv, drop = FALSE], From efe8561fa671358cd870dbb9e6643c8790db236f Mon Sep 17 00:00:00 2001 From: WackerO Date: Thu, 24 Aug 2023 15:41:13 +0200 Subject: [PATCH 18/30] prettier --- docs/output.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/output.md b/docs/output.md index 7f3ba2ad..ea8fa0cf 100644 --- a/docs/output.md +++ b/docs/output.md @@ -42,7 +42,6 @@ Stand-alone graphical outputs are placed in this directory. They may be useful i - `[contrast].proteus.[normfun].normalized_mean_variance_relationship.png`: Plots of log intensity vs mean log intensity after normalization of each contrast level, if chosen. - `[contrast].proteus.[normfun].normalized_distributions.png`: A plot of sample distributions after normalization, if chosen. - `[contrast].proteus.raw_distributions.png`: A plot of sample distributions without normalization. - From cee6aba4ca7787a480fb0bb5ecc1ffdedf5b2bf7 Mon Sep 17 00:00:00 2001 From: WackerO Date: Mon, 28 Aug 2023 12:43:13 +0200 Subject: [PATCH 19/30] made workflow more similar to previous version --- workflows/differentialabundance.nf | 307 +++++++++++++---------------- 1 file changed, 139 insertions(+), 168 deletions(-) diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf index f5546fe8..b27dc79e 100644 --- a/workflows/differentialabundance.nf +++ b/workflows/differentialabundance.nf @@ -110,7 +110,7 @@ include { RMARKDOWNNOTEBOOK } from '../modules/n include { AFFY_JUSTRMA as AFFY_JUSTRMA_RAW } from '../modules/nf-core/affy/justrma/main' include { AFFY_JUSTRMA as AFFY_JUSTRMA_NORM } from '../modules/nf-core/affy/justrma/main' include { PROTEUS_READPROTEINGROUPS as PROTEUS } from '../modules/nf-core/proteus/readproteingroups/main' - + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ RUN MAIN WORKFLOW @@ -125,8 +125,39 @@ workflow DIFFERENTIALABUNDANCE { // Set up some basic variables ch_versions = Channel.empty() - // Check first if maxquant, as in that case the entire VALIDATE block can be skipped - if (params.study_type == 'maxquant') { + // If we have affy array data in the form of CEL files we'll be deriving + // matrix and annotation from them + + if (params.study_type == 'affy_array'){ + + // Uncompress the CEL files archive + + UNTAR ( ch_celfiles ) + + ch_affy_input = ch_input + .join(UNTAR.out.untar) + + // Run affy to derive the matrix. 
Reset the meta so it can be used to + // define a prefix for different matrix flavours + + AFFY_JUSTRMA_RAW ( + ch_affy_input, + [[],[]] + ) + AFFY_JUSTRMA_NORM ( + ch_affy_input, + [[],[]] + ) + + // Fetch affy outputs and reset the meta + + ch_in_raw = AFFY_JUSTRMA_RAW.out.expression + ch_in_norm = AFFY_JUSTRMA_NORM.out.expression + + ch_affy_platform_features = AFFY_JUSTRMA_RAW.out.annotation + + } else if (params.study_type == 'maxquant'){ + // For maxquant, run proteus module to import the protein abundances // Save contrasts file to channel ch_contrasts_file = Channel.from([[exp_meta, file(params.contrasts)]]) @@ -159,185 +190,131 @@ workflow DIFFERENTIALABUNDANCE { proteus_in, ch_contrasts_proteus ) - ch_raw = PROTEUS.out.raw_tab - ch_norm = PROTEUS.out.norm_tab + ch_in_raw = PROTEUS.out.raw_tab + ch_in_norm = PROTEUS.out.norm_tab + ch_versions = ch_versions.mix(PROTEUS.out.versions) + } + + //// Fetch or derive a feature annotation table + + // If user has provided a feature annotation table, use that + + if (params.features){ + ch_features = Channel.of([ exp_meta, file(params.features, checkIfExists: true)]) + } else if (params.study_type == 'affy_array'){ + ch_features = ch_affy_platform_features + } else if (params.study_type == 'maxquant'){ + + // For maxquant, we will use the processed matrices from PROTEUS ch_features = PROTEUS.out.norm_tab.map{ matrix_as_anno_filename = "matrix_as_anno.${it[1].getExtension()}" it[1].copyTo(matrix_as_anno_filename) // copy normalized outfile to use as fake "annotation" it[1] = file(matrix_as_anno_filename) it } - ch_versions = ch_versions.mix(PROTEUS.out.versions) - - // Filter the input matrix - - CUSTOM_MATRIXFILTER( - ch_norm, - ch_input - ) - - // Prepare inputs for differential processes - - ch_samples_and_matrix = ch_input - .join(CUSTOM_MATRIXFILTER.out.filtered) // -> meta, samplesheet, filtered matrix - .first() - - ch_processed_matrices = ch_norm - .map{ it.tail() } - .first() - - ch_all_matrices = ch_input // meta, samples - .join(ch_features) // meta, samples, features - .join(ch_raw) // meta, samples, features, raw matrix - .combine(ch_processed_matrices) // meta, samples, features, raw, norm, ... - .map{ - tuple(it[0], it[1], it[2], it[3..it.size()-1]) - } - .first() - - - } else { + } else if (params.gtf){ + // Get feature annotations from a GTF file, gunzip if necessary - // If we have affy array data in the form of CEL files we'll be deriving - // matrix and annotation from them - if (params.study_type == 'affy_array'){ - - // Uncompress the CEL files archive - - UNTAR ( ch_celfiles ) - - ch_affy_input = ch_input - .join(UNTAR.out.untar) - - // Run affy to derive the matrix. 
Reset the meta so it can be used to - // define a prefix for different matrix flavours - - AFFY_JUSTRMA_RAW ( - ch_affy_input, - [[],[]] - ) - AFFY_JUSTRMA_NORM ( - ch_affy_input, - [[],[]] - ) - - // Fetch affy outputs and reset the meta - - ch_in_raw = AFFY_JUSTRMA_RAW.out.expression - ch_in_norm = AFFY_JUSTRMA_NORM.out.expression - - ch_affy_platform_features = AFFY_JUSTRMA_RAW.out.annotation + file_gtf_in = file(params.gtf) + file_gtf = [ [ "id": file_gtf_in.simpleName ], file_gtf_in ] + if ( params.gtf.endsWith('.gz') ){ + GUNZIP_GTF(file_gtf) + file_gtf = GUNZIP_GTF.out.gunzip + ch_versions = ch_versions.mix(GUNZIP_GTF.out.versions) } - //// Fetch or derive a feature annotation table - - // If user has provided a feature annotation table, use that - if (params.features){ - ch_features = Channel.of([ exp_meta, file(params.features, checkIfExists: true)]) - } else if (params.study_type == 'affy_array'){ - ch_features = ch_affy_platform_features - } else if (params.gtf){ - // Get feature annotations from a GTF file, gunzip if necessary + // Get a features table from the GTF and combine with the matrix and sample + // annotation (fom = features/ observations/ matrix) - file_gtf_in = file(params.gtf) - file_gtf = [ [ "id": file_gtf_in.simpleName ], file_gtf_in ] - - if ( params.gtf.endsWith('.gz') ){ - GUNZIP_GTF(file_gtf) - file_gtf = GUNZIP_GTF.out.gunzip - ch_versions = ch_versions.mix(GUNZIP_GTF.out.versions) + GTF_TO_TABLE( file_gtf, [[ "id":""], []]) + ch_features = GTF_TO_TABLE.out.feature_annotation + .map{ + tuple( exp_meta, it[1]) } - // Get a features table from the GTF and combine with the matrix and sample - // annotation (fom = features/ observations/ matrix) + // Record the version of the GTF -> table tool - GTF_TO_TABLE( file_gtf, [[ "id":""], []]) - ch_features = GTF_TO_TABLE.out.feature_annotation - .map{ - tuple( exp_meta, it[1]) - } + ch_versions = ch_versions + .mix(GTF_TO_TABLE.out.versions) + } + else{ - // Record the version of the GTF -> table tool + // Otherwise we can just use the matrix input + matrix_as_anno_filename = "matrix_as_anno.${matrix_file.getExtension()}" + matrix_file.copyTo(matrix_as_anno_filename) + ch_features = Channel.of([ exp_meta, file(matrix_as_anno_filename)]) + } - ch_versions = ch_versions - .mix(GTF_TO_TABLE.out.versions) - } else { - - // Otherwise we can just use the matrix input - matrix_as_anno_filename = "matrix_as_anno.${matrix_file.getExtension()}" - matrix_file.copyTo(matrix_as_anno_filename) - ch_features = Channel.of([ exp_meta, file(matrix_as_anno_filename)]) - } + // Channel for the contrasts file - // Channel for the contrasts file + ch_contrasts_file = Channel.from([[exp_meta, file(params.contrasts)]]) - ch_contrasts_file = Channel.from([[exp_meta, file(params.contrasts)]]) + // Check compatibility of FOM elements and contrasts - // Check compatibility of FOM elements and contrasts + if (params.study_type == 'affy_array' || params.study_type == 'maxquant'){ + ch_matrices_for_validation = ch_in_raw + .join(ch_in_norm) + .map{tuple(it[0], [it[1], it[2]])} + } + else{ + ch_matrices_for_validation = ch_in_raw + } - if (params.study_type == 'affy_array'){ - ch_matrices_for_validation = ch_in_raw - .join(ch_in_norm) - .map{tuple(it[0], [it[1], it[2]])} - } else { - ch_matrices_for_validation = ch_in_raw - } + VALIDATOR( + ch_input.join(ch_matrices_for_validation), + ch_features, + ch_contrasts_file + ) - VALIDATOR( - ch_input.join(ch_matrices_for_validation), - ch_features, - ch_contrasts_file - ) + // For Affy, we've validated 
multiple input matrices for raw and norm, + // we'll separate them out again here - // For Affy, we've validated multiple input matrices for raw and norm, - // we'll separate them out again here + if (params.study_type == 'affy_array' || params.study_type == 'maxquant'){ + ch_validated_assays = VALIDATOR.out.assays + .transpose() + .branch { + raw: it[1].name.contains('raw') + normalised: it[1].name.contains('normalised') || it[1].name.contains('normalized') + } + ch_raw = ch_validated_assays.raw + ch_norm = ch_validated_assays.normalised + ch_matrix_for_differential = ch_norm + } else{ + ch_raw = VALIDATOR.out.assays + ch_matrix_for_differential = ch_raw + } - if (params.study_type == 'affy_array'){ - ch_validated_assays = VALIDATOR.out.assays - .transpose() - .branch { - raw: it[1].name.contains('raw') - normalised: it[1].name.contains('normalised') - } - ch_raw = ch_validated_assays.raw - ch_norm = ch_validated_assays.normalised - ch_matrix_for_differential = ch_norm + // Split the contrasts up so we can run differential analyses and + // downstream plots separately. + // Replace NA strings that might have snuck into the blocking column - } else { - ch_raw = VALIDATOR.out.assays - ch_matrix_for_differential = ch_raw + ch_contrasts = VALIDATOR.out.contrasts + .map{it[1]} + .splitCsv ( header:true, sep:'\t' ) + .map{ + it.blocking = it.blocking.replace('NA', '') + if (!it.id){ + it.id = it.values().join('_') + } + tuple(it, it.variable, it.reference, it.target) } - // Split the contrasts up so we can run differential analyses and - // downstream plots separately. - // Replace NA strings that might have snuck into the blocking column + // Firstly Filter the input matrix - ch_contrasts = VALIDATOR.out.contrasts - .map{it[1]} - .splitCsv ( header:true, sep:'\t' ) - .map{ - it.blocking = it.blocking.replace('NA', '') - if (!it.id){ - it.id = it.values().join('_') - } - tuple(it, it.variable, it.reference, it.target) - } + CUSTOM_MATRIXFILTER( + ch_matrix_for_differential, + VALIDATOR.out.sample_meta + ) - // Firstly Filter the input matrix + // Prepare inputs for differential processes - CUSTOM_MATRIXFILTER( - ch_matrix_for_differential, - VALIDATOR.out.sample_meta - ) - - // Prepare inputs for differential processes - ch_samples_and_matrix = VALIDATOR.out.sample_meta - .join(CUSTOM_MATRIXFILTER.out.filtered) // -> meta, samplesheet, filtered matrix - .first() - } + ch_samples_and_matrix = VALIDATOR.out.sample_meta + .join(CUSTOM_MATRIXFILTER.out.filtered) // -> meta, samplesheet, filtered matrix + .first() if (params.study_type == 'affy_array' || params.study_type == 'maxquant'){ + LIMMA_DIFFERENTIAL ( ch_contrasts, ch_samples_and_matrix @@ -388,18 +365,6 @@ workflow DIFFERENTIALABUNDANCE { .join(ch_vst) .map{ it.tail() } } - - if (params.study_type != 'maxquant') { - // VALIDATOR has now run for any study_type, so collect its output - ch_all_matrices = VALIDATOR.out.sample_meta // meta, samples - .join(VALIDATOR.out.feature_meta) // meta, samples, features - .join(ch_raw) // meta, samples, features, raw matrix - .combine(ch_processed_matrices) // meta, samples, features, raw, norm, ... 
- .map{ - tuple(it[0], it[1], it[2], it[3..it.size()-1]) - } - .first() - } // Run a gene set analysis where directed @@ -468,6 +433,15 @@ workflow DIFFERENTIALABUNDANCE { } .unique() + ch_all_matrices = VALIDATOR.out.sample_meta // meta, samples + .join(VALIDATOR.out.feature_meta) // meta, samples, features + .join(ch_raw) // meta, samples, features, raw matrix + .combine(ch_processed_matrices) // meta, samples, features, raw, norm, ... + .map{ + tuple(it[0], it[1], it[2], it[3..it.size()-1]) + } + .first() + ch_contrast_variables .combine(ch_all_matrices.map{ it.tail() }) @@ -489,12 +463,9 @@ workflow DIFFERENTIALABUNDANCE { // Gather software versions ch_versions = ch_versions + .mix(VALIDATOR.out.versions) .mix(PLOT_EXPLORATORY.out.versions) .mix(PLOT_DIFFERENTIAL.out.versions) - if (params.study_type != 'maxquant') { - ch_versions = ch_versions - .mix(VALIDATOR.out.versions) - } CUSTOM_DUMPSOFTWAREVERSIONS ( ch_versions.unique().collectFile(name: 'collated_versions.yml') @@ -561,7 +532,7 @@ workflow DIFFERENTIALABUNDANCE { // Condition params reported on study type def params_pattern = ~/^(report|study|observations|features|filtering|exploratory|differential|deseq2|gsea).*/ - if (params.study_type == 'affy_array' || params.study_type == 'maxquant'){ + if (params.study_type == 'affy_array'|| params.study_type == 'maxquant'){ params_pattern = ~/^(report|study|observations|features|filtering|exploratory|differential|affy|limma|gsea).*/ } From 6b7a0194039eb048882e16af75712cedecd47d9f Mon Sep 17 00:00:00 2001 From: WackerO Date: Mon, 2 Oct 2023 15:22:58 +0200 Subject: [PATCH 20/30] Pipeline finally runs again after changing proteus --- conf/modules.config | 19 ++- modules/nf-core/custom/matrixfilter/main.nf | 1 + .../matrixfilter/templates/matrixfilter.R | 44 +++++- .../limma/differential/templates/limma_de.R | 3 + .../nf-core/proteus/readproteingroups/main.nf | 4 +- .../templates/proteus_readproteingroups.R | 125 +++++++++--------- nextflow.config | 2 +- nextflow_schema.json | 2 +- workflows/differentialabundance.nf | 54 ++++---- 9 files changed, 146 insertions(+), 108 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 839b548f..840488f4 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -103,26 +103,33 @@ process { withName: PROTEUS { publishDir = [ [ - path: { "${params.outdir}/tables/proteus" }, + path: { "${params.outdir}/tables/proteus/${meta.id}/" }, mode: params.publish_dir_mode, - pattern: '*proteingroups_tab.tsv' + pattern: '*.tsv' ], [ - path: { "${params.outdir}/plots/proteus" }, + path: { "${params.outdir}/plots/proteus/${meta.id}/" }, mode: params.publish_dir_mode, pattern: '*.png' ], [ - path: { "${params.outdir}/other/proteus" }, + path: { "${params.outdir}/other/proteus/${meta.id}/" }, mode: params.publish_dir_mode, - pattern: '*.{rds,sessionInfo.log}' + pattern: '*.rds' + + ], + [ + path: { "${params.outdir}/other/proteus/" }, + mode: params.publish_dir_mode, + pattern: '*sessionInfo.log' ] ] ext.args = { [ + "--contrast_variable \"${meta.id}\"", "--sample_id_col \"${params.observations_id_col}\"", "--protein_id_col \"${params.features_id_col}\"", "--measure_col_prefix \"${params.proteus_measurecol_prefix}\"", - "--normfuns $params.proteus_norm_functions", + "--normfun $params.proteus_norm_function", "--plotSampleDistributions_method $params.proteus_plotSD_method", "--plotMV_loess $params.proteus_plotMV_loess", "--palette_name $params.proteus_palette_name", diff --git a/modules/nf-core/custom/matrixfilter/main.nf 
b/modules/nf-core/custom/matrixfilter/main.nf
index f569fd6c..137918d7 100644
--- a/modules/nf-core/custom/matrixfilter/main.nf
+++ b/modules/nf-core/custom/matrixfilter/main.nf
@@ -12,6 +12,7 @@ process CUSTOM_MATRIXFILTER {
     output:
     tuple val(meta), path("*.filtered.tsv")    , emit: filtered
+    tuple val(meta), path("*.tests.tsv")       , emit: tests
     tuple val(meta), path("R_sessionInfo.log") , emit: session_info
     path "versions.yml"                        , emit: versions
 
diff --git a/modules/nf-core/custom/matrixfilter/templates/matrixfilter.R b/modules/nf-core/custom/matrixfilter/templates/matrixfilter.R
index 983e97cc..dc8b508f 100644
--- a/modules/nf-core/custom/matrixfilter/templates/matrixfilter.R
+++ b/modules/nf-core/custom/matrixfilter/templates/matrixfilter.R
@@ -78,7 +78,9 @@ opt <- list(
     minimum_abundance = 1,
     minimum_samples = 1,
     minimum_proportion = 0,
-    grouping_variable = NULL
+    grouping_variable = NULL,
+    minimum_proportion_not_na = 0.5,
+    minimum_samples_not_na = NULL
 )
 opt_types <- lapply(opt, class)
 
@@ -152,11 +154,29 @@ if ((opt\$sample_file != '') && ( ! is.null(opt\$grouping_variable))){
     opt\$minimum_samples <- ncol(abundance_matrix) * opt\$minimum_proportion
 }
 
-# Generate a boolean vector specifying the features to retain
+# Also set up filtering for NAs; use by default minimum_proportion_not_na; only
+# use minimum_samples_not_na if it is provided (default NULL)
 
-keep <- apply(abundance_matrix, 1, function(x){
-    sum(x > opt\$minimum_abundance) >= opt\$minimum_samples
-})
+if (is.null(opt\$minimum_samples_not_na)) {
+    opt\$minimum_samples_not_na <- ncol(abundance_matrix) * opt\$minimum_proportion_not_na
+}
+
+# Define the tests
+
+tests <- list(
+    'abundance' = function(x) sum(x > opt\$minimum_abundance, na.rm = T) >= opt\$minimum_samples,
+    'na' = function(x) !any(is.na(x)) || sum(!is.na(x)) >= opt\$minimum_samples_not_na
+)
+
+# Apply the functions row-wise on the abundance_matrix and store the result in a boolean matrix
+
+boolean_matrix <- t(apply(abundance_matrix, 1, function(row) {
+    sapply(tests, function(f) f(row))
+}))
+
+# We will retain features passing all tests
+
+keep <- apply(boolean_matrix, 1, all)
 
 # Write out the matrix retaining the specified rows and re-prepending the
 # column with the feature identifiers
@@ -175,6 +195,20 @@ write.table(
     quote = FALSE
 )
 
+# Write a boolean matrix specifying the status of each test
+
+write.table(
+    data.frame(rownames(abundance_matrix), boolean_matrix),
+    file = paste0(
+        prefix,
+        '.tests.tsv'
+    ),
+    col.names = c(feature_id_name, names(tests)),
+    row.names = FALSE,
+    sep = '\t',
+    quote = FALSE
+)
+
 ################################################
 ################################################
 ## R SESSION INFO                             ##
diff --git a/modules/nf-core/limma/differential/templates/limma_de.R b/modules/nf-core/limma/differential/templates/limma_de.R
index 5a80eb22..6acad566 100755
--- a/modules/nf-core/limma/differential/templates/limma_de.R
+++ b/modules/nf-core/limma/differential/templates/limma_de.R
@@ -149,6 +149,9 @@ library(limma)
 ################################################
 ################################################
 
+file.copy(opt\$count_file, "/home-link/iivow01/git/differentialabundance/error_limma/count_file")
+file.copy(opt\$sample_file, "/home-link/iivow01/git/differentialabundance/error_limma/sample_file")
+
 intensities.table <-
     read_delim_flexible(
         file = opt\$count_file,
diff --git a/modules/nf-core/proteus/readproteingroups/main.nf b/modules/nf-core/proteus/readproteingroups/main.nf
index c7018a82..37126cfe 
100644 --- a/modules/nf-core/proteus/readproteingroups/main.nf +++ b/modules/nf-core/proteus/readproteingroups/main.nf @@ -1,5 +1,5 @@ process PROTEUS_READPROTEINGROUPS { - tag "$meta" + tag "$meta.id" label 'process_single' conda "conda-forge::r-base=4.2.1 bioconda::r-proteus-bartongroup=0.2.16 conda-forge::r-plotly=4.10.2 bioconda::bioconductor-limma=3.54.0" @@ -9,8 +9,6 @@ process PROTEUS_READPROTEINGROUPS { input: tuple val(meta), path(samplesheet), path(intensities) - tuple val(meta2), val(contrast_variable) - output: tuple val(meta), path("*dendrogram.png") , emit: dendro_plot diff --git a/modules/nf-core/proteus/readproteingroups/templates/proteus_readproteingroups.R b/modules/nf-core/proteus/readproteingroups/templates/proteus_readproteingroups.R index ff75ea84..f335c0aa 100644 --- a/modules/nf-core/proteus/readproteingroups/templates/proteus_readproteingroups.R +++ b/modules/nf-core/proteus/readproteingroups/templates/proteus_readproteingroups.R @@ -124,11 +124,11 @@ round_dataframe_columns <- function(df, columns = NULL, digits = -1) { opt <- list( intensities_file = '$intensities', sample_file = '$samplesheet', - contrast_variable = '$contrast_variable', + contrast_variable = NULL, protein_id_col = 'Majority protein IDs', sample_id_col = 'sample', measure_col_prefix = 'intensities', - normfuns = 'normalizeMedian', + normfun = 'normalizeMedian', plotSampleDistributions_method = 'violin', plotMV_loess = T, palette_name = 'Set1', @@ -154,7 +154,7 @@ for ( ao in names(args_opt)) { # Check if required parameters have been provided -required_opts <- c('intensities_file', 'sample_file', 'contrast_variable') +required_opts <- c('contrast_variable') missing <- required_opts[unlist(lapply(opt[required_opts], is.null)) | ! required_opts %in% names(opt)] if (length(missing) > 0) { @@ -238,8 +238,6 @@ if (length(missing_columns) > 0) { ################################################ ################################################ -output_prefix <- opt\$contrast_variable - # Replace proteus default ID column with user param and re-set the names of the resulting object (gsub sets the names to NULL) proteinColumns <- setNames(gsub("Majority protein IDs", opt\$protein_id_col, proteus::proteinColumns), names(proteus::proteinColumns)) @@ -254,76 +252,71 @@ proteinGroups <- readProteinGroups( valid_normfuns <- list("normalizeMedian", "normalizeQuantiles") -# Generate plots for all requested normalizations; also, save normalized protein groups for limma - -for (normfun in unlist(strsplit(opt\$normfuns, ","))) { - if (! (normfun %in% valid_normfuns)) { - stop(paste0("Invalid normfuns argument: ", normfun, - ". Valid normfuns are: ", paste(valid_normfuns, collapse=", "), ".")) - } - - proteinGroups.normalized <- normalizeData(proteinGroups, norm.fun = eval(parse(text=normfun))) # Proteus also accepts other norm.funs, e.g. from limma +# Generate plots for requested normalization; also, save normalized protein groups for limma - # Apply log2 and remove NAs as these will otherwise mess with some of the following modules +if (! (opt\$normfun %in% valid_normfuns)) { + stop(paste0("Invalid normfun argument: ", opt\$normfun, + ". Valid normfuns are: ", paste(valid_normfuns, collapse=", "), ".")) +} - proteinGroups.normalized\$tab <- na.omit(log2(proteinGroups.normalized\$tab)) +proteinGroups.normalized <- normalizeData(proteinGroups, norm.fun = eval(parse(text=opt\$normfun))) # Proteus also accepts other norm.funs, e.g. 
from limma +proteinGroups.normalized\$tab <- log2(proteinGroups.normalized\$tab) - png(paste(output_prefix, 'proteus', normfun, 'normalized_distributions.png', sep='.'), width = 5*300, height = 5*300, res = 300, pointsize = 8) - print( - plotSampleDistributions(proteinGroups.normalized, title=paste0("Sample distributions after applying\n", normfun), fill="condition", method=opt\$plotSampleDistributions_method) - + scale_fill_brewer(palette=opt\$palette_name, name=opt\$contrast_variable) - + theme(plot.title = element_text(size = 12)) - ) - dev.off() +png(paste(opt\$normfun, 'normalized_distributions.png', sep='.'), width = 5*300, height = 5*300, res = 300, pointsize = 8) +print( + plotSampleDistributions(proteinGroups.normalized, title=paste0("Sample distributions after applying\n", opt\$normfun, " in contrast ", opt\$contrast_variable), fill="condition", method=opt\$plotSampleDistributions_method) + + scale_fill_brewer(palette=opt\$palette_name, name=opt\$contrast_variable) + + theme(plot.title = element_text(size = 12)) + ) +dev.off() - png(paste(output_prefix, 'proteus', normfun, 'normalized_mean_variance_relationship.png', sep='.'), width = 5*300, height = 5*300, res = 300, pointsize = 8) - print( - plotMV(proteinGroups.normalized, with.loess=opt\$plotMV_loess) - + ggtitle(paste0("Sample mean variance relationship after applying\n", normfun)) - + scale_fill_distiller(palette=opt\$palette_name) - + theme(plot.title = element_text(size = 12)) - ) - dev.off() +png(paste(opt\$normfun, 'normalized_mean_variance_relationship.png', sep='.'), width = 5*300, height = 5*300, res = 300, pointsize = 8) +print( + plotMV(proteinGroups.normalized, with.loess=opt\$plotMV_loess) + + ggtitle(paste0("Sample mean variance relationship after applying\n", opt\$normfun, " in contrast ", opt\$contrast_variable)) + + scale_fill_distiller(palette=opt\$palette_name) + + theme(plot.title = element_text(size = 12)) + ) +dev.off() - png(paste(output_prefix, 'proteus', normfun, 'normalized_dendrogram.png', sep='.'), width = 5*300, height = 5*300, res = 300, pointsize = 8) - print( - plotClustering(proteinGroups.normalized) - + ggtitle(paste0("Sample clustering after applying\n", normfun)) - + theme(plot.title = element_text(size = 12)) - ) - dev.off() +png(paste(opt\$normfun, 'normalized_dendrogram.png', sep='.'), width = 5*300, height = 5*300, res = 300, pointsize = 8) +print( + plotClustering(proteinGroups.normalized) + + ggtitle(paste0("Sample clustering after applying\n", opt\$normfun, " in contrast ", opt\$contrast_variable)) + + theme(plot.title = element_text(size = 12)) + ) +dev.off() - # R object for other processes to use +# R object for other processes to use - saveRDS(proteinGroups.normalized, file = paste(output_prefix, 'proteus', normfun, 'normalized_proteingroups.rds', sep='.')) +saveRDS(proteinGroups.normalized, file = paste(opt\$normfun, 'normalized_proteingroups.rds', sep='.')) - # Write normalized intensities matrix +# Write normalized intensities matrix - out_df <- data.frame( - round_dataframe_columns(proteinGroups.normalized\$tab, digits=opt\$round_digits), - check.names = FALSE - ) - out_df[[opt\$protein_id_col]] <- rownames(proteinGroups.normalized\$tab) # proteus saves the IDs as rownames; save these to a separate column - out_df <- out_df[c(opt\$protein_id_col, colnames(out_df)[colnames(out_df) != opt\$protein_id_col])] # move ID column to first position - write.table( - out_df, - file = paste(output_prefix, 'proteus', normfun, 'normalized_proteingroups_tab', 'tsv', sep = '.'), - col.names 
= TRUE, - row.names = FALSE, - sep = '\t', - quote = FALSE - ) -} +out_df <- data.frame( + round_dataframe_columns(proteinGroups.normalized\$tab, digits=opt\$round_digits), + check.names = FALSE +) +out_df[[opt\$protein_id_col]] <- rownames(proteinGroups.normalized\$tab) # proteus saves the IDs as rownames; save these to a separate column +out_df <- out_df[c(opt\$protein_id_col, colnames(out_df)[colnames(out_df) != opt\$protein_id_col])] # move ID column to first position +write.table( + out_df, + file = paste(opt\$normfun, 'normalized_proteingroups_tab', 'tsv', sep = '.'), + col.names = TRUE, + row.names = FALSE, + sep = '\t', + quote = FALSE +) # Process and save raw table -proteinGroups\$tab <- na.omit(log2(proteinGroups\$tab)) +proteinGroups\$tab <- log2(proteinGroups\$tab) # Generate raw distribution plot -png(paste(output_prefix, 'proteus.raw_distributions.png', sep='.'), width = 5*300, height = 5*300, res = 300, pointsize = 8) +png('raw_distributions.png', width = 5*300, height = 5*300, res = 300, pointsize = 8) print( - plotSampleDistributions(proteinGroups, title="Raw sample distributions", fill="condition", method=opt\$plotSampleDistributions_method) + plotSampleDistributions(proteinGroups, title=paste("Raw sample distributions in contrast", opt\$contrast_variable), fill="condition", method=opt\$plotSampleDistributions_method) + scale_fill_brewer(palette=opt\$palette_name, name=opt\$contrast_variable) + theme(plot.title = element_text(size = 12)) ) @@ -331,7 +324,8 @@ dev.off() # R object for other processes to use -saveRDS(proteinGroups, file = paste(output_prefix, 'proteus.raw_proteingroups.rds', sep = '.')) +saveRDS(proteinGroups, file = 'raw_proteingroups.rds') + # Write raw intensities matrix @@ -345,7 +339,16 @@ out_df <- out_df[c(opt\$protein_id_col, colnames(out_df)[colnames(out_df) != opt write.table( out_df, - file = paste(output_prefix, 'proteus', 'raw_proteingroups_tab', 'tsv', sep = '.'), + file = 'raw_proteingroups_tab.tsv', + col.names = TRUE, + row.names = FALSE, + sep = '\t', + quote = FALSE +) + +write.table( + out_df, + file = '/home-link/iivow01/git/save_differentialabundance/error/raw_proteingroups_tab.tsv', col.names = TRUE, row.names = FALSE, sep = '\t', diff --git a/nextflow.config b/nextflow.config index 6a0b06d4..a1b034da 100644 --- a/nextflow.config +++ b/nextflow.config @@ -60,7 +60,7 @@ params { // Proteus-specific options proteus_measurecol_prefix = 'LFQ intensity ' - proteus_norm_functions = 'normalizeMedian' + proteus_norm_function = 'normalizeMedian' proteus_plotSD_method = 'violin' proteus_plotMV_loess = true proteus_palette_name = 'Set1' diff --git a/nextflow_schema.json b/nextflow_schema.json index b231b85f..a370976c 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -266,7 +266,7 @@ "default": "LFQ intensity ", "description": "Prefix of the column names of the MaxQuant proteingroups table in which the intensity values are saved; the prefix has to be followed by the sample names that are also found in the samplesheet. Default: 'LFQ intensity '; take care to also consider trailing whitespace between prefix and samplenames." 
}, - "proteus_norm_functions": { + "proteus_norm_function": { "type": "string", "default": "normalizeMedian", "description": "Comma-separated string of normalization functions to use on the MaxQuant intensities.", diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf index 30a21adf..83bb59ce 100644 --- a/workflows/differentialabundance.nf +++ b/workflows/differentialabundance.nf @@ -32,14 +32,18 @@ if (params.study_type == 'affy_array'){ // Should the user have enabled --shinyngs_build_app and/or --gsea_run, throw an error if (params.shinyngs_build_app) { + // This can be removed once shinyngs has an inbuilt NA handler error("Cannot build shinyngs app for maxquant data; please set --shinyngs_build_app to false.") } if (params.gsea_run) { error("Cannot run GSEA for maxquant data; please set --gsea_run to false.") } + if (!params.matrix) { + error("Input matrix not specified!") + } // Make channel for proteus - proteus_in = Channel.of([ exp_meta, file(params.input), file(params.matrix) ]) + proteus_in = Channel.of([ file(params.input), file(params.matrix) ]) } else if (params.study_type == 'geo_soft_file'){ // To pull SOFT files from a GEO a GSE study identifer must be provided @@ -136,6 +140,8 @@ workflow DIFFERENTIALABUNDANCE { // Set up some basic variables ch_versions = Channel.empty() + // Channel for the contrasts file + ch_contrasts_file = Channel.from([[exp_meta, file(params.contrasts)]]) // If we have affy array data in the form of CEL files we'll be deriving // matrix and annotation from them @@ -172,41 +178,32 @@ workflow DIFFERENTIALABUNDANCE { .mix(AFFY_JUSTRMA_RAW.out.versions) } else if (params.study_type == 'maxquant'){ - // For maxquant, run proteus module to import the protein abundances - - // Save contrasts file to channel - ch_contrasts_file = Channel.from([[exp_meta, file(params.contrasts)]]) // Split contrasts for proteus and for the later modules ch_contrasts_split = ch_contrasts_file .splitCsv ( header:true, sep:(params.contrasts.endsWith('tsv') ? 
'\t' : ',')) .map{ it.tail().first() } - // For proteus, extract only meta and contrast variable + // For proteus, extract only contrast variable as the module has to run once per contrast ch_contrasts_proteus = ch_contrasts_split .map{ - tuple( - exp_meta, // meta map - it.variable // contrast variable - ) - } - - // For the plotting and following modules, save all contrast info - ch_contrasts = ch_contrasts_split - .map{ - it.blocking = it.blocking.replace('NA', '') - if (!it.id){ - it.id = it.values().join('_') - } - tuple(it, it.variable, it.reference, it.target) + tuple('id': it.variable) } + .unique() + // Run proteus to import protein abundances PROTEUS( - proteus_in, - ch_contrasts_proteus + ch_contrasts_proteus.combine(proteus_in) ) + + // Re-map the proteus output tables to the study ID as the tables are the same across contrasts ch_in_raw = PROTEUS.out.raw_tab + .reduce{a, b -> a} + .map{tuple('id': exp_meta.id, it[1])} ch_in_norm = PROTEUS.out.norm_tab + .reduce{a, b -> a} + .map{tuple('id': exp_meta.id, it[1])} + ch_versions = ch_versions.mix(PROTEUS.out.versions) } else if(params.study_type == 'geo_soft_file'){ @@ -227,11 +224,11 @@ workflow DIFFERENTIALABUNDANCE { } else if (params.study_type == 'maxquant'){ // For maxquant, we will use the processed matrices from PROTEUS - ch_features = PROTEUS.out.norm_tab.map{ - matrix_as_anno_filename = "matrix_as_anno.${it[1].getExtension()}" + ch_features = ch_in_norm + .map{ + matrix_as_anno_filename = "${it[1].getParent()}/matrix_as_anno.${it[1].getExtension()}" it[1].copyTo(matrix_as_anno_filename) // copy normalized outfile to use as fake "annotation" - it[1] = file(matrix_as_anno_filename) - it + it = tuple(it[0], file(matrix_as_anno_filename)) } } else if(params.study_type == 'geo_soft_file') { ch_features = ch_soft_features @@ -269,12 +266,7 @@ workflow DIFFERENTIALABUNDANCE { ch_features = Channel.of([ exp_meta, file(matrix_as_anno_filename)]) } - // Channel for the contrasts file - - ch_contrasts_file = Channel.from([[exp_meta, file(params.contrasts)]]) - // Check compatibility of FOM elements and contrasts - if (params.study_type == 'affy_array' || params.study_type == 'maxquant'){ ch_matrices_for_validation = ch_in_raw .join(ch_in_norm) From 70ad1c986161901b71fb2dc9990a94060e967593 Mon Sep 17 00:00:00 2001 From: WackerO Date: Wed, 4 Oct 2023 08:33:09 +0200 Subject: [PATCH 21/30] Added proteus params table to report, renamed some params --- assets/differentialabundance_report.Rmd | 11 +++++ conf/modules.config | 6 +-- docs/output.md | 8 ++-- .../limma/differential/templates/limma_de.R | 3 -- .../templates/proteus_readproteingroups.R | 45 ++++++++----------- nextflow.config | 4 +- nextflow_schema.json | 9 ++-- workflows/differentialabundance.nf | 2 +- 8 files changed, 44 insertions(+), 44 deletions(-) diff --git a/assets/differentialabundance_report.Rmd b/assets/differentialabundance_report.Rmd index ecd3b4fd..957e00c8 100644 --- a/assets/differentialabundance_report.Rmd +++ b/assets/differentialabundance_report.Rmd @@ -41,6 +41,12 @@ params: variance_stabilised_matrix: null # e.g. test_files/3_treatment-WT-P23H.vst.tsv contrasts_file: null # e.g. 
GSE156533.contrasts.csv differential_table: file.csv + proteus_measurecol_prefix: NULL + proteus_norm_function: NULL + proteus_plotsd_method: NULL + proteus_plotmv_loess: NULL + proteus_palette_name: NULL + proteus_round_digits: NULL affy_cel_files_archive: NULL affy_file_name_col: NULL affy_background: NULL @@ -862,6 +868,11 @@ if (any(unlist(params[paste0(possible_gene_set_methods, '_run')]))){ # Methods +```{r, echo=FALSE, results='asis', eval=params$study_type == 'maxquant'} +cat(paste0("\n## Protein abundance import\n")) +make_params_table('importing maxquant output', 'proteus_', remove_pattern = TRUE) +``` + ## Filtering ```{r, echo=FALSE, results='asis'} diff --git a/conf/modules.config b/conf/modules.config index 840488f4..58318a9d 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -129,9 +129,9 @@ process { "--sample_id_col \"${params.observations_id_col}\"", "--protein_id_col \"${params.features_id_col}\"", "--measure_col_prefix \"${params.proteus_measurecol_prefix}\"", - "--normfun $params.proteus_norm_function", - "--plotSampleDistributions_method $params.proteus_plotSD_method", - "--plotMV_loess $params.proteus_plotMV_loess", + "--norm_function $params.proteus_norm_function", + "--plotsd_method $params.proteus_plotsd_method", + "--plotmv_loess $params.proteus_plotmv_loess", "--palette_name $params.proteus_palette_name", "--round_digits $params.proteus_round_digits" ].join(' ').trim() } diff --git a/docs/output.md b/docs/output.md index ea8fa0cf..4ab43010 100644 --- a/docs/output.md +++ b/docs/output.md @@ -38,9 +38,9 @@ Stand-alone graphical outputs are placed in this directory. They may be useful i - `gsea/`: Directory containing graphical outputs from GSEA (where enabled). Plots are stored in directories named for the associated contrast. - `[contrast]/png/[gsea_plot_type].png` - `proteus/`: If `--study_type maxquant`: Directory containing plots produced by the proteus module which is used for processing MaxQuant input. Files are prefixed with the associated contrast and chosen normalization function (if any). - - `[contrast].proteus.[normfun].normalized_dendrogram.png`: A sample clustering dendrogram after normalization, if chosen. - - `[contrast].proteus.[normfun].normalized_mean_variance_relationship.png`: Plots of log intensity vs mean log intensity after normalization of each contrast level, if chosen. - - `[contrast].proteus.[normfun].normalized_distributions.png`: A plot of sample distributions after normalization, if chosen. + - `[contrast].proteus.[norm_function].normalized_dendrogram.png`: A sample clustering dendrogram after normalization, if chosen. + - `[contrast].proteus.[norm_function].normalized_mean_variance_relationship.png`: Plots of log intensity vs mean log intensity after normalization of each contrast level, if chosen. + - `[contrast].proteus.[norm_function].normalized_distributions.png`: A plot of sample distributions after normalization, if chosen. - `[contrast].proteus.raw_distributions.png`: A plot of sample distributions without normalization. @@ -67,7 +67,7 @@ Most plots are included in the HTML report (see above), but are also included in - `gsea/`: Directory containing tables of differential gene set analyis from GSEA (where enabled) - `[contrast]/[contrast].gsea_report_for_[condition].tsv`: A GSEA report table for each side of each contrast - `proteus/`: If `--study_type maxquant`: Directory containing abundance values produced by the proteus module which is used for processing MaxQuant input. 
Files are prefixed with the associated contrast and chosen normalization function (if any). - - `[contrast].proteus.[normfun].normalized_proteingroups_tab.tsv`: Abundance table after normalization, if chosen. + - `[contrast].proteus.[norm_function].normalized_proteingroups_tab.tsv`: Abundance table after normalization, if chosen. - `[contrast].proteus.raw_proteingroups_tab.tsv`: Abundance table without normalization. diff --git a/modules/nf-core/limma/differential/templates/limma_de.R b/modules/nf-core/limma/differential/templates/limma_de.R index 6acad566..5a80eb22 100755 --- a/modules/nf-core/limma/differential/templates/limma_de.R +++ b/modules/nf-core/limma/differential/templates/limma_de.R @@ -149,9 +149,6 @@ library(limma) ################################################ ################################################ -file.copy(opt\$count_file, "/home-link/iivow01/git/differentialabundance/error_limma/count_file") -file.copy(opt\$sample_file, "/home-link/iivow01/git/differentialabundance/error_limma/sample_file") - intensities.table <- read_delim_flexible( file = opt\$count_file, diff --git a/modules/nf-core/proteus/readproteingroups/templates/proteus_readproteingroups.R b/modules/nf-core/proteus/readproteingroups/templates/proteus_readproteingroups.R index f335c0aa..5806971d 100644 --- a/modules/nf-core/proteus/readproteingroups/templates/proteus_readproteingroups.R +++ b/modules/nf-core/proteus/readproteingroups/templates/proteus_readproteingroups.R @@ -128,9 +128,9 @@ opt <- list( protein_id_col = 'Majority protein IDs', sample_id_col = 'sample', measure_col_prefix = 'intensities', - normfun = 'normalizeMedian', - plotSampleDistributions_method = 'violin', - plotMV_loess = T, + norm_function = 'normalizeMedian', + plotsd_method = 'violin', + plotmv_loess = T, palette_name = 'Set1', round_digits = -1 ) @@ -250,46 +250,46 @@ proteinGroups <- readProteinGroups( # Define valid normalization functions -valid_normfuns <- list("normalizeMedian", "normalizeQuantiles") +valid_norm_functions <- list("normalizeMedian", "normalizeQuantiles") # Generate plots for requested normalization; also, save normalized protein groups for limma -if (! (opt\$normfun %in% valid_normfuns)) { - stop(paste0("Invalid normfun argument: ", opt\$normfun, - ". Valid normfuns are: ", paste(valid_normfuns, collapse=", "), ".")) +if (! (opt\$norm_function %in% valid_norm_functions)) { + stop(paste0("Invalid norm_function argument: ", opt\$norm_function, + ". Valid norm_functions are: ", paste(valid_norm_functions, collapse=", "), ".")) } -proteinGroups.normalized <- normalizeData(proteinGroups, norm.fun = eval(parse(text=opt\$normfun))) # Proteus also accepts other norm.funs, e.g. from limma +proteinGroups.normalized <- normalizeData(proteinGroups, norm.fun = eval(parse(text=opt\$norm_function))) # Proteus also accepts other norm.funs, e.g. 
from limma proteinGroups.normalized\$tab <- log2(proteinGroups.normalized\$tab) -png(paste(opt\$normfun, 'normalized_distributions.png', sep='.'), width = 5*300, height = 5*300, res = 300, pointsize = 8) +png(paste(opt\$norm_function, 'normalized_distributions.png', sep='.'), width = 5*300, height = 5*300, res = 300, pointsize = 8) print( - plotSampleDistributions(proteinGroups.normalized, title=paste0("Sample distributions after applying\n", opt\$normfun, " in contrast ", opt\$contrast_variable), fill="condition", method=opt\$plotSampleDistributions_method) + plotSampleDistributions(proteinGroups.normalized, title=paste0("Sample distributions after applying\n", opt\$norm_function, " in contrast ", opt\$contrast_variable), fill="condition", method=opt\$plotsd_method) + scale_fill_brewer(palette=opt\$palette_name, name=opt\$contrast_variable) + theme(plot.title = element_text(size = 12)) ) dev.off() -png(paste(opt\$normfun, 'normalized_mean_variance_relationship.png', sep='.'), width = 5*300, height = 5*300, res = 300, pointsize = 8) +png(paste(opt\$norm_function, 'normalized_mean_variance_relationship.png', sep='.'), width = 5*300, height = 5*300, res = 300, pointsize = 8) print( - plotMV(proteinGroups.normalized, with.loess=opt\$plotMV_loess) - + ggtitle(paste0("Sample mean variance relationship after applying\n", opt\$normfun, " in contrast ", opt\$contrast_variable)) + plotMV(proteinGroups.normalized, with.loess=opt\$plotmv_loess) + + ggtitle(paste0("Sample mean variance relationship after applying\n", opt\$norm_function, " in contrast ", opt\$contrast_variable)) + scale_fill_distiller(palette=opt\$palette_name) + theme(plot.title = element_text(size = 12)) ) dev.off() -png(paste(opt\$normfun, 'normalized_dendrogram.png', sep='.'), width = 5*300, height = 5*300, res = 300, pointsize = 8) +png(paste(opt\$norm_function, 'normalized_dendrogram.png', sep='.'), width = 5*300, height = 5*300, res = 300, pointsize = 8) print( plotClustering(proteinGroups.normalized) - + ggtitle(paste0("Sample clustering after applying\n", opt\$normfun, " in contrast ", opt\$contrast_variable)) + + ggtitle(paste0("Sample clustering after applying\n", opt\$norm_function, " in contrast ", opt\$contrast_variable)) + theme(plot.title = element_text(size = 12)) ) dev.off() # R object for other processes to use -saveRDS(proteinGroups.normalized, file = paste(opt\$normfun, 'normalized_proteingroups.rds', sep='.')) +saveRDS(proteinGroups.normalized, file = paste(opt\$norm_function, 'normalized_proteingroups.rds', sep='.')) # Write normalized intensities matrix @@ -301,7 +301,7 @@ out_df[[opt\$protein_id_col]] <- rownames(proteinGroups.normalized\$tab) # prote out_df <- out_df[c(opt\$protein_id_col, colnames(out_df)[colnames(out_df) != opt\$protein_id_col])] # move ID column to first position write.table( out_df, - file = paste(opt\$normfun, 'normalized_proteingroups_tab', 'tsv', sep = '.'), + file = paste(opt\$norm_function, 'normalized_proteingroups_tab', 'tsv', sep = '.'), col.names = TRUE, row.names = FALSE, sep = '\t', @@ -316,7 +316,7 @@ proteinGroups\$tab <- log2(proteinGroups\$tab) png('raw_distributions.png', width = 5*300, height = 5*300, res = 300, pointsize = 8) print( - plotSampleDistributions(proteinGroups, title=paste("Raw sample distributions in contrast", opt\$contrast_variable), fill="condition", method=opt\$plotSampleDistributions_method) + plotSampleDistributions(proteinGroups, title=paste("Raw sample distributions in contrast", opt\$contrast_variable), fill="condition", method=opt\$plotsd_method) + 
scale_fill_brewer(palette=opt\$palette_name, name=opt\$contrast_variable) + theme(plot.title = element_text(size = 12)) ) @@ -346,15 +346,6 @@ write.table( quote = FALSE ) -write.table( - out_df, - file = '/home-link/iivow01/git/save_differentialabundance/error/raw_proteingroups_tab.tsv', - col.names = TRUE, - row.names = FALSE, - sep = '\t', - quote = FALSE -) - ################################################ ################################################ ## R SESSION INFO ## diff --git a/nextflow.config b/nextflow.config index a1b034da..48f30f5a 100644 --- a/nextflow.config +++ b/nextflow.config @@ -61,8 +61,8 @@ params { // Proteus-specific options proteus_measurecol_prefix = 'LFQ intensity ' proteus_norm_function = 'normalizeMedian' - proteus_plotSD_method = 'violin' - proteus_plotMV_loess = true + proteus_plotsd_method = 'violin' + proteus_plotmv_loess = true proteus_palette_name = 'Set1' proteus_round_digits = -1 diff --git a/nextflow_schema.json b/nextflow_schema.json index a370976c..5a812165 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -269,17 +269,18 @@ "proteus_norm_function": { "type": "string", "default": "normalizeMedian", - "description": "Comma-separated string of normalization functions to use on the MaxQuant intensities.", - "help_text": "'normalizeMedian', 'normalizeQuantiles' or any comma-separated combination thereof" + "description": "Normalization function to use on the MaxQuant intensities.", + "help_text": "'normalizeMedian' or 'normalizeQuantiles'", + "enum": ["normalizeMedian", "normalizeQuantiles"] }, - "proteus_plotSD_method": { + "proteus_plotsd_method": { "type": "string", "default": "violin", "description": "Which method to use for plotting sample distributions of the MaxQuant intensities; one of 'violin', 'dist', 'box'.", "help_text": "'violin', 'dist' or 'box'", "enum": ["violin", "dist", "box"] }, - "proteus_plotMV_loess": { + "proteus_plotmv_loess": { "type": "boolean", "default": true, "description": "Should a loess line be added to the plot of mean-variance relationship of the conditions? Default: true." 
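As a small illustration of how the proteus_* options documented in this schema block fit together (the sample IDs below are invented, and the column construction follows the proteus_measurecol_prefix description above rather than quoting module code):

```r
# Illustration only: combining --proteus_measurecol_prefix with samplesheet IDs
# to select intensity columns from the MaxQuant proteinGroups table.
samples      <- c("Sample1", "Sample2", "Sample3")   # values of --observations_id_col (invented)
prefix       <- "LFQ intensity "                     # --proteus_measurecol_prefix (note the trailing space)
measure_cols <- setNames(paste0(prefix, samples), samples)
# -> "LFQ intensity Sample1", "LFQ intensity Sample2", "LFQ intensity Sample3"

# --proteus_norm_function is limited to the two values the template accepts
valid_norm_functions <- c("normalizeMedian", "normalizeQuantiles")
norm_function <- "normalizeMedian"
stopifnot(norm_function %in% valid_norm_functions)

# --proteus_plotsd_method picks the plotSampleDistributions() style ('violin', 'dist' or 'box');
# --proteus_plotmv_loess toggles the loess trend line drawn by plotMV().
```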
diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf index 83bb59ce..7871329c 100644 --- a/workflows/differentialabundance.nf +++ b/workflows/differentialabundance.nf @@ -559,7 +559,7 @@ workflow DIFFERENTIALABUNDANCE { def params_pattern = ~/^(report|study|observations|features|filtering|exploratory|differential|deseq2|gsea).*/ if (params.study_type == 'affy_array' || params.study_type == 'geo_soft_file' || params.study_type == 'maxquant'){ - params_pattern = ~/^(report|study|observations|features|filtering|exploratory|differential|affy|limma|gsea).*/ + params_pattern = ~/^(report|study|observations|features|filtering|exploratory|differential|proteus|affy|limma|gsea).*/ } ch_report_params = ch_report_input_files From b6ed728ff001df2a4b4703401cd47dd9339a6db7 Mon Sep 17 00:00:00 2001 From: WackerO Date: Wed, 4 Oct 2023 13:52:20 +0200 Subject: [PATCH 22/30] updated proteus module --- modules.json | 2 +- modules/nf-core/proteus/readproteingroups/meta.yml | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/modules.json b/modules.json index b5ed0aa9..bca82e93 100644 --- a/modules.json +++ b/modules.json @@ -62,7 +62,7 @@ }, "proteus/readproteingroups": { "branch": "master", - "git_sha": "007dd9c990670392d3fb6607529966a1a614e1e1", + "git_sha": "685765c4a5e3423d20f74aa9c4405ef0b8c4748d", "installed_by": ["modules"] }, "rmarkdownnotebook": { diff --git a/modules/nf-core/proteus/readproteingroups/meta.yml b/modules/nf-core/proteus/readproteingroups/meta.yml index 02031d9c..bed3dc68 100644 --- a/modules/nf-core/proteus/readproteingroups/meta.yml +++ b/modules/nf-core/proteus/readproteingroups/meta.yml @@ -33,8 +33,7 @@ input: - contrast_variable: type: string description: | - The column in the sample sheet that should be used to define groups for - comparison + The column in the sample sheet that should be used to define groups for comparison output: - dendro_plot: From a73d256179e849e81b9283f8335b94282ed095d2 Mon Sep 17 00:00:00 2001 From: WackerO <43847497+WackerO@users.noreply.github.com> Date: Mon, 9 Oct 2023 12:49:25 +0200 Subject: [PATCH 23/30] Update docs/usage.md Co-authored-by: Jonathan Manning --- docs/usage.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/usage.md b/docs/usage.md index c8700119..dcdcd7e4 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -14,7 +14,7 @@ With the above in mind, running this workflow requires: - (for RNA-seq or MaxQuant proteomics measurements): a matrix of quantifications with observations by column and features by row - (for Affymetrix microarrays): a tar'd archive of CEL files - a description of the observations such as a sample sheet from RNA-seq analysis -- a description of the features (skip for MaxQuant), for our initial RNA-seq application this can be simply the GTF file from which gene annotations can be derived. For Affymetrix arrays this can be derived from the array platform annotation package automatically. You can also supply your own table. +- a description of the features, for our initial RNA-seq application this can be simply the GTF file from which gene annotations can be derived. For Affymetrix arrays this can be derived from the array platform annotation package automatically. Skip for MaxQuant. You can also supply your own table. 
- a specification of how the matrix should be split, and how the resulting groups should be compared ## Observations (samplesheet) input From 3e6c8fed4d1df19e91b6d5f47987ea3cd24457fc Mon Sep 17 00:00:00 2001 From: WackerO <43847497+WackerO@users.noreply.github.com> Date: Mon, 9 Oct 2023 12:50:22 +0200 Subject: [PATCH 24/30] Update workflows/differentialabundance.nf Co-authored-by: Jonathan Manning --- workflows/differentialabundance.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf index 2372c92c..6ce7a94f 100644 --- a/workflows/differentialabundance.nf +++ b/workflows/differentialabundance.nf @@ -33,7 +33,7 @@ if (params.study_type == 'affy_array'){ // Should the user have enabled --shinyngs_build_app and/or --gsea_run, throw an error if (params.shinyngs_build_app) { // This can be removed once shinyngs has an inbuilt NA handler - error("Cannot build shinyngs app for maxquant data; please set --shinyngs_build_app to false.") + error("Cannot currently build shinyngs app for maxquant data due to data sparsity; please set --shinyngs_build_app to false.") } if (params.gsea_run) { error("Cannot run GSEA for maxquant data; please set --gsea_run to false.") From d4751ceed50a2eb800d1b3c68d76412f95454a34 Mon Sep 17 00:00:00 2001 From: WackerO <43847497+WackerO@users.noreply.github.com> Date: Mon, 9 Oct 2023 12:50:49 +0200 Subject: [PATCH 25/30] Update workflows/differentialabundance.nf Co-authored-by: Jonathan Manning --- workflows/differentialabundance.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf index 6ce7a94f..f9b51b34 100644 --- a/workflows/differentialabundance.nf +++ b/workflows/differentialabundance.nf @@ -293,7 +293,7 @@ workflow DIFFERENTIALABUNDANCE { .transpose() .branch { raw: it[1].name.contains('raw') - normalised: it[1].name.contains('normalised') || it[1].name.contains('normalized') + normalised: it[1].name =~ /normali[sz]ed/ } ch_raw = ch_validated_assays.raw ch_norm = ch_validated_assays.normalised From 42a4acc8044bfe73b987185f0402e07ea1b22758 Mon Sep 17 00:00:00 2001 From: WackerO Date: Mon, 9 Oct 2023 13:25:03 +0200 Subject: [PATCH 26/30] Working on review --- assets/differentialabundance_report.Rmd | 57 ++----------------------- conf/maxquant.config | 6 +-- conf/test_maxquant.config | 11 ----- workflows/differentialabundance.nf | 46 ++++++++++---------- 4 files changed, 31 insertions(+), 89 deletions(-) diff --git a/assets/differentialabundance_report.Rmd b/assets/differentialabundance_report.Rmd index 5d776c17..f3a0b1d5 100644 --- a/assets/differentialabundance_report.Rmd +++ b/assets/differentialabundance_report.Rmd @@ -241,52 +241,6 @@ assay_names <- simpleSplit(params$exploratory_assay_names) names(assay_names) = assay_names assay_files <- lapply(assay_names, function(x) params[[paste0(x, '_matrix')]]) -# Set up vector of unlogged assay files (if any) -unlogged <- c() -features_log2_assays <- params$features_log2_assays - -# For maxquant input override param as all assays are already logged by proteus -if (params$study_type == 'maxquant') { - features_log2_assays <- "" -} - -if (is.null(features_log2_assays)) { - # Guess unlogged assays - for (assay in c(1:length(assay_files))) { - if (max(assay_files[[assay]]) > 20) { - unlogged <- append(unlogged, assay) - } - } - -} else { - features_log2_assays <- gsub('\\]$', '', gsub('^\\[', '', features_log2_assays)) # Remove brackets from assay 
list - - if (features_log2_assays != "") { - - if (is_valid_positive_integer_vector(features_log2_assays)) { - # Convert to list of assay positions - unlogged <- unique(as.integer(simpleSplit(features_log2_assays))) - invalid_assays <- unlogged[! unlogged %in% 1:length(assay_files)] - - if (length(invalid_assays) > 0){ - stop(paste0("Invalid assay numbers: ", paste(invalid_assays, collapse=', '))) - } - - } else { - # Last option is string of assay names, so get positions of those names in the assay list - unlogged <- unique(simpleSplit(features_log2_assays)) - - # Check if all names are valid - invalid_assays <- unlogged[!(unlogged %in% names(assay_files))] - if (length(invalid_assays) > 0) { - stop(paste0(invalid_assays, " is/are not valid assay name(s). Valid assay names are: ", paste(names(assay_files), collapse=", "), ". Please check param --features_log2_assays.")) - } - unlogged <- match(unlogged, names(assay_files)) - } - - } -} - assay_data <- lapply(c(1:length(assay_files)), function(x) { mat <- na.omit( read_matrix( @@ -296,15 +250,12 @@ assay_data <- lapply(c(1:length(assay_files)), function(x) { ) ) colnames(mat) <- observations[[params$observations_name_col]][match(colnames(mat), rownames(observations))] - - # If assay is in unlogged list, apply log2 - if (x %in% unlogged) { - mat <- log2(mat+1) - } - mat - }) +# Remove brackets from assay list. TODO: Remove if this is added to cond_log2_transform_assays +params$features_log2_assays <- gsub('\\]$', '', gsub('^\\[', '', features_log2_assays)) +assay_data <- cond_log2_transform_assays(assay_data, params$features_log2_assays) + # Lapply does not go over the assay_files themselves anymore, so we need to specifically assign their names to the data names(assay_data) <- names(assay_files) diff --git a/conf/maxquant.config b/conf/maxquant.config index 7da526fb..a5f2b2d2 100644 --- a/conf/maxquant.config +++ b/conf/maxquant.config @@ -26,8 +26,8 @@ params { features_type = 'protein' // Exploratory - exploratory_assay_names = "raw,normalised,variance_stabilised" - exploratory_final_assay = "variance_stabilised" + exploratory_assay_names = "raw,normalised" + exploratory_final_assay = "normalised" // Differential options differential_file_suffix = ".limma.results.tsv" @@ -35,7 +35,7 @@ params { differential_pval_column = "P.Value" differential_qval_column = "adj.P.Val" differential_feature_id_column = "probe_id" - differential_feature_name_column = "probe_id" + differential_feature_name_column = "Majority protein IDs" // Shiny does not work for this datatype shinyngs_build_app = false diff --git a/conf/test_maxquant.config b/conf/test_maxquant.config index d5a7dcf8..8afe4929 100644 --- a/conf/test_maxquant.config +++ b/conf/test_maxquant.config @@ -28,10 +28,6 @@ params { matrix = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/proteomics/maxquant/MaxQuant_proteinGroups.txt' contrasts = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/proteomics/maxquant/MaxQuant_contrasts.csv' - // Features - features_id_col = 'Majority protein IDs' - features_metadata_cols = 'Majority protein IDs' - // Observations observations_id_col = 'Experiment' observations_name_col = 'Name' @@ -39,12 +35,5 @@ params { // Exploratory exploratory_main_variable = 'Celltype' - exploratory_assay_names = 'raw,normalised' exploratory_final_assay = 'normalised' - - // Differential - differential_feature_id_column = 'probe_id' - differential_feature_name_column = 'Majority protein IDs' - differential_fc_column = 'logFC' - 
differential_qval_column = 'adj.P.Val' } diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf index 650684cd..dfd349ff 100644 --- a/workflows/differentialabundance.nf +++ b/workflows/differentialabundance.nf @@ -179,30 +179,29 @@ workflow DIFFERENTIALABUNDANCE { } else if (params.study_type == 'maxquant'){ - // Split contrasts for proteus and for the later modules - ch_contrasts_split = ch_contrasts_file + // We'll be running Proteus once per unique contrast variable to generate plots + // TODO: there should probably be a separate plotting module in proteus to simplify this + + ch_contrast_variables = ch_contrasts_file .splitCsv ( header:true, sep:(params.contrasts.endsWith('tsv') ? '\t' : ',')) .map{ it.tail().first() } - - // For proteus, extract only contrast variable as the module has to run once per contrast - ch_contrasts_proteus = ch_contrasts_split .map{ tuple('id': it.variable) } - .unique() + .unique() // uniquify to keep each contrast variable only once (in case it exists in multiple lines for blocking etc.) // Run proteus to import protein abundances PROTEUS( - ch_contrasts_proteus.combine(proteus_in) + ch_contrast_variables.combine(proteus_in) ) // Re-map the proteus output tables to the study ID as the tables are the same across contrasts ch_in_raw = PROTEUS.out.raw_tab - .reduce{a, b -> a} - .map{tuple('id': exp_meta.id, it[1])} + .first() + .map{ meta, matrix -> tuple(exp_meta, matrix) } ch_in_norm = PROTEUS.out.norm_tab - .reduce{a, b -> a} - .map{tuple('id': exp_meta.id, it[1])} + .first() + .map{ meta, matrix -> tuple(exp_meta, matrix) } ch_versions = ch_versions.mix(PROTEUS.out.versions) } else if(params.study_type == 'geo_soft_file'){ @@ -221,15 +220,6 @@ workflow DIFFERENTIALABUNDANCE { ch_features = Channel.of([ exp_meta, file(params.features, checkIfExists: true)]) } else if (params.study_type == 'affy_array'){ ch_features = ch_affy_platform_features - } else if (params.study_type == 'maxquant'){ - - // For maxquant, we will use the processed matrices from PROTEUS - ch_features = ch_in_norm - .map{ - matrix_as_anno_filename = "${it[1].getParent()}/matrix_as_anno.${it[1].getExtension()}" - it[1].copyTo(matrix_as_anno_filename) // copy normalized outfile to use as fake "annotation" - it = tuple(it[0], file(matrix_as_anno_filename)) - } } else if(params.study_type == 'geo_soft_file') { ch_features = ch_soft_features } else if (params.gtf){ @@ -262,10 +252,22 @@ workflow DIFFERENTIALABUNDANCE { // Otherwise we can just use the matrix input matrix_as_anno_filename = "matrix_as_anno.${matrix_file.getExtension()}" - matrix_file.copyTo(matrix_as_anno_filename) - ch_features = Channel.of([ exp_meta, file(matrix_as_anno_filename)]) + if (params.study_type == 'maxquant'){ + ch_features_matrix = ch_in_norm + } else { + ch_features_matrix = ch_in_raw + } + ch_features = ch_features_matrix + .map{ exp_meta, matrix_file -> + matrix_file.copyTo(matrix_as_anno_filename) + [ exp_meta, file(matrix_as_anno_filename) ] + } } + + + + // Check compatibility of FOM elements and contrasts if (params.study_type == 'affy_array' || params.study_type == 'maxquant'){ ch_matrices_for_validation = ch_in_raw From fa1527438a577b3043a9a64dffe23c0245e91712 Mon Sep 17 00:00:00 2001 From: WackerO Date: Mon, 9 Oct 2023 15:03:24 +0200 Subject: [PATCH 27/30] Finished the review changes, pipeline runs locally --- CHANGELOG.md | 5 +++-- assets/differentialabundance_report.Rmd | 10 ++++++---- conf/maxquant.config | 3 +++ conf/modules.config | 4 ++-- conf/test_maxquant.config | 
2 -- modules/nf-core/shinyngs/app/main.nf | 6 +++--- workflows/differentialabundance.nf | 18 ++++++++++-------- 7 files changed, 27 insertions(+), 21 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1ca121b8..53cdb045 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,7 +14,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [# 136](https://github.com/nf-core/differentialabundance/pull/136)] - Added support for non-Affymetrix arrays via automatic download of SOFT matrices in GEO ([@azedinez](https://github.com/azedinez), review by [@pinin4fjords](https://github.com/pinin4fjords)) - [[#137](https://github.com/nf-core/differentialabundance/pull/137)] - Add `--sizefactors_from_controls` and `--gene_id_col` for DESeq2 module to modules.config ([@WackerO](https://github.com/WackerO), review by [@pinin4fjords](https://github.com/pinin4fjords)) - [[#145](https://github.com/nf-core/differentialabundance/pull/145)] - Template update for nf-core/tools v2.9 ([@nf-core-bot](https://github.com/nf-core-bot), review by [@pinin4fjords](https://github.com/pinin4fjords), [@WackerO](https://github.com/WackerO)) -- [[#147](https://github.com/nf-core/differentialabundance/pull/147)] - Add Maxquant analysis module ([@WackerO](https://github.com/WackerO), review by) +- [[#147](https://github.com/nf-core/differentialabundance/pull/147)] - Add Maxquant analysis module ([@WackerO](https://github.com/WackerO), review by [@pinin4fjords](https://github.com/pinin4fjords)) ### `Fixed` @@ -29,8 +29,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Changed` - [[#159](https://github.com/nf-core/differentialabundance/issues/159)] - CUSTOM/MATRIXFILTER module update ([@WackerO](https://github.com/WackerO), review by [@suzannejin](https://github.com/suzannejin)) -- [[#152](https://github.com/nf-core/differentialabundance/issues/152)] - RMARKDOWNNOTEBOOK env update ([@WackerO](https://github.com/WackerO), review by [@pinin4fjords](https://github.com/pinin4fjords)) +- [[#154](https://github.com/nf-core/differentialabundance/issues/154)] - RMARKDOWNNOTEBOOK env update ([@WackerO](https://github.com/WackerO), review by [@pinin4fjords](https://github.com/pinin4fjords)) - [[#151](https://github.com/nf-core/differentialabundance/issues/151)] - Module update ([@WackerO](https://github.com/WackerO), review by [@pinin4fjords](https://github.com/pinin4fjords)) +- [[#147](https://github.com/nf-core/differentialabundance/pull/147)] - RMARKDOWNNOTEBOOK env update ([@WackerO](https://github.com/WackerO), review by [@pinin4fjords](https://github.com/pinin4fjords)) ## v1.2.0 - 2023-04-19 diff --git a/assets/differentialabundance_report.Rmd b/assets/differentialabundance_report.Rmd index 3562feda..669f0514 100644 --- a/assets/differentialabundance_report.Rmd +++ b/assets/differentialabundance_report.Rmd @@ -250,11 +250,14 @@ assay_data <- lapply(c(1:length(assay_files)), function(x) { ) ) colnames(mat) <- observations[[params$observations_name_col]][match(colnames(mat), rownames(observations))] + mat }) -# Remove brackets from assay list. TODO: Remove if this is added to cond_log2_transform_assays -params$features_log2_assays <- gsub('\\]$', '', gsub('^\\[', '', features_log2_assays)) -assay_data <- cond_log2_transform_assays(assay_data, params$features_log2_assays) +if (!is.null(params$features_log2_assays)) { + # Remove brackets from assay list. 
TODO: Remove if this is added to cond_log2_transform_assays + features_log2_assays <- gsub('\\]$', '', gsub('^\\[', '', params$features_log2_assays)) + assay_data <- cond_log2_transform_assays(assay_data, features_log2_assays) +} # Lapply does not go over the assay_files themselves anymore, so we need to specifically assign their names to the data names(assay_data) <- names(assay_files) @@ -263,7 +266,6 @@ names(assay_data) <- names(assay_files) rownames(observations) <- observations[[params$observations_name_col]] # Run PCA early so we can understand how important each variable is - pca_datas <- lapply(names(assay_data), function(assay_type){ compilePCAData(assay_data[[assay_type]]) }) diff --git a/conf/maxquant.config b/conf/maxquant.config index a5f2b2d2..c2f081ca 100644 --- a/conf/maxquant.config +++ b/conf/maxquant.config @@ -37,6 +37,9 @@ params { differential_feature_id_column = "probe_id" differential_feature_name_column = "Majority protein IDs" + // Proteus options + proteus_measurecol_prefix = 'LFQ intensity ' + // Shiny does not work for this datatype shinyngs_build_app = false } diff --git a/conf/modules.config b/conf/modules.config index 58318a9d..37798f8d 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -390,8 +390,8 @@ process { } withName: RMARKDOWNNOTEBOOK { - conda = "bioconda::r-shinyngs=1.8.1" - container = { "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/r-shinyngs:1.8.1--r43hdfd78af_0' : 'quay.io/biocontainers/r-shinyngs:1.8.1--r43hdfd78af_0' }" } + conda = "bioconda::r-shinyngs=1.8.2" + container = { "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/r-shinyngs:1.8.2--r43hdfd78af_0' : 'quay.io/biocontainers/r-shinyngs:1.8.2--r43hdfd78af_0' }" } publishDir = [ path: { "${params.outdir}/report" }, mode: params.publish_dir_mode, diff --git a/conf/test_maxquant.config b/conf/test_maxquant.config index 8afe4929..89d31cca 100644 --- a/conf/test_maxquant.config +++ b/conf/test_maxquant.config @@ -31,9 +31,7 @@ params { // Observations observations_id_col = 'Experiment' observations_name_col = 'Name' - proteus_measurecol_prefix = 'LFQ intensity ' // Exploratory exploratory_main_variable = 'Celltype' - exploratory_final_assay = 'normalised' } diff --git a/modules/nf-core/shinyngs/app/main.nf b/modules/nf-core/shinyngs/app/main.nf index 60ea6a79..0822b251 100644 --- a/modules/nf-core/shinyngs/app/main.nf +++ b/modules/nf-core/shinyngs/app/main.nf @@ -13,10 +13,10 @@ process SHINYNGS_APP { // // Those values must then be set in your Nextflow secrets. - conda "bioconda::r-shinyngs=1.8.1" + conda "bioconda::r-shinyngs=1.8.2" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/r-shinyngs:1.8.1--r43hdfd78af_0' : - 'biocontainers/r-shinyngs:1.8.1--r43hdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/r-shinyngs:1.8.2--r43hdfd78af_0' : + 'biocontainers/r-shinyngs:1.8.2--r43hdfd78af_0' }" input: tuple val(meta), path(sample), path(feature_meta), path(assay_files) // Experiment-level info diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf index 3fed3c93..8e6096ef 100644 --- a/workflows/differentialabundance.nf +++ b/workflows/differentialabundance.nf @@ -41,9 +41,10 @@ if (params.study_type == 'affy_array'){ if (!params.matrix) { error("Input matrix not specified!") } - + matrix_file = file(params.matrix, checkIfExists: true) + // Make channel for proteus - proteus_in = Channel.of([ file(params.input), file(params.matrix) ]) + proteus_in = Channel.of([ file(params.input), matrix_file ]) } else if (params.study_type == 'geo_soft_file'){ // To pull SOFT files from a GEO a GSE study identifer must be provided @@ -195,7 +196,7 @@ workflow DIFFERENTIALABUNDANCE { ch_contrast_variables.combine(proteus_in) ) - // Re-map the proteus output tables to the study ID as the tables are the same across contrasts + // Re-map the proteus output tables to the study ID as the tables are the same across contrasts, only one norm table will be necessary ch_in_raw = PROTEUS.out.raw_tab .first() .map{ meta, matrix -> tuple(exp_meta, matrix) } @@ -250,17 +251,18 @@ workflow DIFFERENTIALABUNDANCE { } else{ - // Otherwise we can just use the matrix input - matrix_as_anno_filename = "matrix_as_anno.${matrix_file.getExtension()}" + // Otherwise we can just use the matrix input; save it to the workdir so that it does not + // just appear wherever the user runs the pipeline + matrix_as_anno_filename = "${workflow.workDir}/matrix_as_anno.${matrix_file.getExtension()}" if (params.study_type == 'maxquant'){ ch_features_matrix = ch_in_norm } else { ch_features_matrix = ch_in_raw } ch_features = ch_features_matrix - .map{ exp_meta, matrix_file -> - matrix_file.copyTo(matrix_as_anno_filename) - [ exp_meta, file(matrix_as_anno_filename) ] + .map{ meta, matrix -> + matrix.copyTo(matrix_as_anno_filename) + [ meta, file(matrix_as_anno_filename) ] } } From b103a552d3980de4555e17e4d483ec112a932ff1 Mon Sep 17 00:00:00 2001 From: WackerO Date: Mon, 9 Oct 2023 15:13:34 +0200 Subject: [PATCH 28/30] Restored shinyngs module --- modules/nf-core/shinyngs/app/main.nf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/nf-core/shinyngs/app/main.nf b/modules/nf-core/shinyngs/app/main.nf index 0822b251..60ea6a79 100644 --- a/modules/nf-core/shinyngs/app/main.nf +++ b/modules/nf-core/shinyngs/app/main.nf @@ -13,10 +13,10 @@ process SHINYNGS_APP { // // Those values must then be set in your Nextflow secrets. - conda "bioconda::r-shinyngs=1.8.2" + conda "bioconda::r-shinyngs=1.8.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/r-shinyngs:1.8.2--r43hdfd78af_0' : - 'biocontainers/r-shinyngs:1.8.2--r43hdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/r-shinyngs:1.8.1--r43hdfd78af_0' : + 'biocontainers/r-shinyngs:1.8.1--r43hdfd78af_0' }" input: tuple val(meta), path(sample), path(feature_meta), path(assay_files) // Experiment-level info From 700796584db6a2872fd49a911d8edbac739fbb91 Mon Sep 17 00:00:00 2001 From: WackerO Date: Mon, 9 Oct 2023 15:24:29 +0200 Subject: [PATCH 29/30] module updates --- modules.json | 8 ++++---- modules/nf-core/custom/dumpsoftwareversions/main.nf | 6 +++--- modules/nf-core/shinyngs/staticdifferential/main.nf | 6 +++--- modules/nf-core/shinyngs/staticexploratory/main.nf | 6 +++--- modules/nf-core/shinyngs/validatefomcomponents/main.nf | 6 +++--- workflows/differentialabundance.nf | 4 ---- 6 files changed, 16 insertions(+), 20 deletions(-) diff --git a/modules.json b/modules.json index d3d6f1a2..dcf9482f 100644 --- a/modules.json +++ b/modules.json @@ -17,7 +17,7 @@ }, "custom/dumpsoftwareversions": { "branch": "master", - "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "git_sha": "05c280924b6c768d484c7c443dad5e605c4ff4b4", "installed_by": ["modules"] }, "custom/matrixfilter": { @@ -77,17 +77,17 @@ }, "shinyngs/staticdifferential": { "branch": "master", - "git_sha": "707c31e838cb77198e30d8eeb138728ce09a4dd2", + "git_sha": "022afb76b0fc7e304b0061a648f8f6cef03bba95", "installed_by": ["modules"] }, "shinyngs/staticexploratory": { "branch": "master", - "git_sha": "707c31e838cb77198e30d8eeb138728ce09a4dd2", + "git_sha": "022afb76b0fc7e304b0061a648f8f6cef03bba95", "installed_by": ["modules"] }, "shinyngs/validatefomcomponents": { "branch": "master", - "git_sha": "707c31e838cb77198e30d8eeb138728ce09a4dd2", + "git_sha": "022afb76b0fc7e304b0061a648f8f6cef03bba95", "installed_by": ["modules"] }, "untar": { diff --git a/modules/nf-core/custom/dumpsoftwareversions/main.nf b/modules/nf-core/custom/dumpsoftwareversions/main.nf index ebc87273..c9d014b1 100644 --- a/modules/nf-core/custom/dumpsoftwareversions/main.nf +++ b/modules/nf-core/custom/dumpsoftwareversions/main.nf @@ -2,10 +2,10 @@ process CUSTOM_DUMPSOFTWAREVERSIONS { label 'process_single' // Requires `pyyaml` which does not have a dedicated container but is in the MultiQC container - conda "bioconda::multiqc=1.14" + conda "bioconda::multiqc=1.15" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/multiqc:1.14--pyhdfd78af_0' : - 'biocontainers/multiqc:1.14--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.15--pyhdfd78af_0' : + 'biocontainers/multiqc:1.15--pyhdfd78af_0' }" input: path versions diff --git a/modules/nf-core/shinyngs/staticdifferential/main.nf b/modules/nf-core/shinyngs/staticdifferential/main.nf index cdc40d02..d2bbbc8c 100644 --- a/modules/nf-core/shinyngs/staticdifferential/main.nf +++ b/modules/nf-core/shinyngs/staticdifferential/main.nf @@ -2,10 +2,10 @@ process SHINYNGS_STATICDIFFERENTIAL { tag "$meta.id" label 'process_single' - conda "bioconda::r-shinyngs=1.8.1" + conda "bioconda::r-shinyngs=1.8.2" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/r-shinyngs:1.8.1--r43hdfd78af_0' : - 'biocontainers/r-shinyngs:1.8.1--r43hdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/r-shinyngs:1.8.2--r43hdfd78af_0' : + 'biocontainers/r-shinyngs:1.8.2--r43hdfd78af_0' }" input: tuple val(meta), path(differential_result) // Differential info: contrast and differential stats diff --git a/modules/nf-core/shinyngs/staticexploratory/main.nf b/modules/nf-core/shinyngs/staticexploratory/main.nf index 2c351949..851325bf 100644 --- a/modules/nf-core/shinyngs/staticexploratory/main.nf +++ b/modules/nf-core/shinyngs/staticexploratory/main.nf @@ -2,10 +2,10 @@ process SHINYNGS_STATICEXPLORATORY { tag "$meta.id" label 'process_single' - conda "bioconda::r-shinyngs=1.8.1" + conda "bioconda::r-shinyngs=1.8.2" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/r-shinyngs:1.8.1--r43hdfd78af_0' : - 'biocontainers/r-shinyngs:1.8.1--r43hdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/r-shinyngs:1.8.2--r43hdfd78af_0' : + 'biocontainers/r-shinyngs:1.8.2--r43hdfd78af_0' }" input: tuple val(meta), path(sample), path(feature_meta), path(assay_files) diff --git a/modules/nf-core/shinyngs/validatefomcomponents/main.nf b/modules/nf-core/shinyngs/validatefomcomponents/main.nf index 98fb49e1..bdfb19af 100644 --- a/modules/nf-core/shinyngs/validatefomcomponents/main.nf +++ b/modules/nf-core/shinyngs/validatefomcomponents/main.nf @@ -2,10 +2,10 @@ process SHINYNGS_VALIDATEFOMCOMPONENTS { tag "$sample" label 'process_single' - conda "bioconda::r-shinyngs=1.8.1" + conda "bioconda::r-shinyngs=1.8.2" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/r-shinyngs:1.8.1--r43hdfd78af_0' : - 'biocontainers/r-shinyngs:1.8.1--r43hdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/r-shinyngs:1.8.2--r43hdfd78af_0' : + 'biocontainers/r-shinyngs:1.8.2--r43hdfd78af_0' }" input: tuple val(meta), path(sample), path(assay_files) diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf index 8e6096ef..5f2ed24a 100644 --- a/workflows/differentialabundance.nf +++ b/workflows/differentialabundance.nf @@ -266,10 +266,6 @@ workflow DIFFERENTIALABUNDANCE { } } - - - - // Check compatibility of FOM elements and contrasts if (params.study_type == 'affy_array' || params.study_type == 'maxquant'){ ch_matrices_for_validation = ch_in_raw From f26a2241f753e5c2aa7e53ca16936f1f33227b00 Mon Sep 17 00:00:00 2001 From: WackerO Date: Mon, 9 Oct 2023 15:42:58 +0200 Subject: [PATCH 30/30] Cleanup, updated changelog, fixed output docs --- CHANGELOG.md | 2 +- assets/differentialabundance_report.Rmd | 10 +++------- docs/output.md | 12 ++++++------ nextflow.config | 2 +- 4 files changed, 11 insertions(+), 15 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 53cdb045..582f2078 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -31,7 +31,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [[#159](https://github.com/nf-core/differentialabundance/issues/159)] - CUSTOM/MATRIXFILTER module update ([@WackerO](https://github.com/WackerO), review by [@suzannejin](https://github.com/suzannejin)) - [[#154](https://github.com/nf-core/differentialabundance/issues/154)] - RMARKDOWNNOTEBOOK env update ([@WackerO](https://github.com/WackerO), review by [@pinin4fjords](https://github.com/pinin4fjords)) - [[#151](https://github.com/nf-core/differentialabundance/issues/151)] - Module update ([@WackerO](https://github.com/WackerO), review by [@pinin4fjords](https://github.com/pinin4fjords)) -- [[#147](https://github.com/nf-core/differentialabundance/pull/147)] - RMARKDOWNNOTEBOOK env update ([@WackerO](https://github.com/WackerO), review by [@pinin4fjords](https://github.com/pinin4fjords)) +- [[#147](https://github.com/nf-core/differentialabundance/pull/147)] - RMARKDOWNNOTEBOOK env update, SHINYNGS and CUSTOM update ([@WackerO](https://github.com/WackerO), review by [@pinin4fjords](https://github.com/pinin4fjords)) ## v1.2.0 - 2023-04-19 diff --git a/assets/differentialabundance_report.Rmd b/assets/differentialabundance_report.Rmd index 669f0514..d6b654b8 100644 --- a/assets/differentialabundance_report.Rmd +++ b/assets/differentialabundance_report.Rmd @@ -241,16 +241,16 @@ assay_names <- simpleSplit(params$exploratory_assay_names) names(assay_names) = assay_names assay_files <- lapply(assay_names, function(x) params[[paste0(x, '_matrix')]]) -assay_data <- lapply(c(1:length(assay_files)), function(x) { +assay_data <- lapply(assay_files, function(x) { mat <- na.omit( read_matrix( - assay_files[[x]], + x, sample_metadata = observations, row.names = 1 ) ) colnames(mat) <- observations[[params$observations_name_col]][match(colnames(mat), rownames(observations))] - mat + mat }) if (!is.null(params$features_log2_assays)) { @@ -259,9 +259,6 @@ if (!is.null(params$features_log2_assays)) { assay_data <- cond_log2_transform_assays(assay_data, features_log2_assays) } -# Lapply does not go over the assay_files themselves anymore, so we need to specifically assign their names to the data -names(assay_data) <- names(assay_files) - # Now 
we can rename the observations rows using the title field rownames(observations) <- observations[[params$observations_name_col]] @@ -604,7 +601,6 @@ for (assay_type in rev(names(assay_data))){ variable_genes <- selectVariableGenes(matrix = assay_data[[assay_type]], ntop = params$exploratory_n_features) dendroColorScale <- makeColorScale(length(unique(observations[[iv]])), palette = params$exploratory_palette_name) - p <- clusteringDendrogram( 2^assay_data[[assay_type]][variable_genes, ], observations[, iv, drop = FALSE], diff --git a/docs/output.md b/docs/output.md index 88dc5186..f256c505 100644 --- a/docs/output.md +++ b/docs/output.md @@ -38,10 +38,10 @@ Stand-alone graphical outputs are placed in this directory. They may be useful i - `gsea/`: Directory containing graphical outputs from GSEA (where enabled). Plots are stored in directories named for the associated contrast. - `[contrast]/png/[gsea_plot_type].png` - `proteus/`: If `--study_type maxquant`: Directory containing plots produced by the proteus module which is used for processing MaxQuant input. Files are prefixed with the associated contrast and chosen normalization function (if any). - - `[contrast].proteus.[norm_function].normalized_dendrogram.png`: A sample clustering dendrogram after normalization, if chosen. - - `[contrast].proteus.[norm_function].normalized_mean_variance_relationship.png`: Plots of log intensity vs mean log intensity after normalization of each contrast level, if chosen. - - `[contrast].proteus.[norm_function].normalized_distributions.png`: A plot of sample distributions after normalization, if chosen. - - `[contrast].proteus.raw_distributions.png`: A plot of sample distributions without normalization. + - `[contrast]/[norm_function].normalized_dendrogram.png`: A sample clustering dendrogram after normalization. + - `[contrast]/[norm_function].normalized_mean_variance_relationship.png`: Plots of log intensity vs mean log intensity after normalization of each contrast level. + - `[contrast]/[norm_function].normalized_distributions.png`: A plot of sample distributions after normalization. + - `[contrast]/raw_distributions.png`: A plot of sample distributions without normalization. @@ -67,8 +67,8 @@ Most plots are included in the HTML report (see above), but are also included in - `gsea/`: Directory containing tables of differential gene set analyis from GSEA (where enabled) - `[contrast]/[contrast].gsea_report_for_[condition].tsv`: A GSEA report table for each side of each contrast - `proteus/`: If `--study_type maxquant`: Directory containing abundance values produced by the proteus module which is used for processing MaxQuant input. Files are prefixed with the associated contrast and chosen normalization function (if any). - - `[contrast].proteus.[norm_function].normalized_proteingroups_tab.tsv`: Abundance table after normalization, if chosen. - - `[contrast].proteus.raw_proteingroups_tab.tsv`: Abundance table without normalization. + - `[contrast]/[norm_function].normalized_proteingroups_tab.tsv`: Abundance table after normalization. + - `[contrast]/raw_proteingroups_tab.tsv`: Abundance table without normalization. 
diff --git a/nextflow.config b/nextflow.config index 3fa9fcc7..0e07ee51 100644 --- a/nextflow.config +++ b/nextflow.config @@ -60,7 +60,7 @@ params { // Proteus-specific options proteus_measurecol_prefix = 'LFQ intensity ' - proteus_norm_function = 'normalizeMedian' + proteus_norm_function = 'normalizeMedian' proteus_plotsd_method = 'violin' proteus_plotmv_loess = true proteus_palette_name = 'Set1'
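Of the proteus defaults above, proteus_measurecol_prefix determines which columns of the MaxQuant proteinGroups.txt are treated as per-sample quantifications, typically by prepending the prefix to the sample identifiers (for example 'Intensity ' rather than 'LFQ intensity ' when LFQ was not enabled). A small R sketch of that idea with invented column names; it only mimics the role of the parameter and is not the module's actual parsing code:

# Invented proteinGroups-style column names; only the prefixed columns carry
# per-sample quantifications. This is an illustration of what the prefix
# parameter selects, not the proteus module's parsing logic.
measurecol_prefix <- "LFQ intensity "
cols <- c("Protein IDs", "Majority protein IDs", "Razor + unique peptides",
          "LFQ intensity Sample1", "LFQ intensity Sample2")

measure_cols <- grep(paste0("^", measurecol_prefix), cols, value = TRUE)
sample_ids   <- sub(paste0("^", measurecol_prefix), "", measure_cols)

measure_cols   # "LFQ intensity Sample1" "LFQ intensity Sample2"
sample_ids     # "Sample1" "Sample2"

If your quantification columns use a different prefix, set --proteus_measurecol_prefix accordingly, keeping the trailing space seen in the default value.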