From 618a90d7c52bf23b7cbd4c810a33193d825256ce Mon Sep 17 00:00:00 2001 From: Azedine Zoufir Date: Tue, 13 Jun 2023 10:46:12 +0100 Subject: [PATCH 01/72] R script read from soft matrix --- modules/local/templates/read_soft_matrix.R | 107 +++++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100644 modules/local/templates/read_soft_matrix.R diff --git a/modules/local/templates/read_soft_matrix.R b/modules/local/templates/read_soft_matrix.R new file mode 100644 index 00000000..b1746b3f --- /dev/null +++ b/modules/local/templates/read_soft_matrix.R @@ -0,0 +1,107 @@ +#!/usr/bin/env Rscript + +############################ +# FUNCTIONS +############################ +# From affy/justRMA (pinin4fjords) +# Parse out options from a string without recourse to optparse +# @param x Long-form argument list like --opt1 val1 --opt2 val2 +# return named list of options and values similar to optparse +parse_args <- function(x){ + args_list <- unlist(strsplit(x, ' ?--')[[1]])[-1] + args_vals <- lapply(args_list, function(x) scan(text=x, what='character', quiet = TRUE)) + + # Ensure the option vectors are length 2 (key/ value) to catch empty ones + args_vals <- lapply(args_vals, function(z){ length(z) <- 2; z}) + + parsed_args <- structure(lapply(args_vals, function(x) x[2]), names = lapply(args_vals, function(x) x[1])) + parsed_args[! is.na(parsed_args)] +} +# From affy/justRMA (pinin4fjords) +# Round numeric dataframe columns to fixed decimal places by applying +# formatting and converting back to numerics +# @param dataframe A data frame +# @param columns Which columns to round (assumes all of them by default) +# @param digits How many decimal places to round to? 
+# @return output Data frame +round_dataframe_columns <- function(df, columns = NULL, digits = 8){ + if (is.null(columns)){ + columns <- colnames(df) + } + + df[,columns] <- format(data.frame(df[, columns]), nsmall = digits) + + # Convert columns back to numeric + + for (c in columns) { + df[[c]][grep("^ *NA$", df[[c]])] <- NA + df[[c]] <- as.numeric(df[[c]]) + } + df +} + +############################ +# PARSE PARAMS FROM NEXTFLOW +############################ + +opt <- list( + queryGSE = '$querygse' +) +args_opt <- parse_args('$task.ext.args') +for ( ao in names(args_opt)){ + if (! ao %in% names(opt)){ + stop(paste("Invalid option:", ao)) + }else{ + opt[[ao]] <- args_opt[[ao]] + } +} + +############################ +# MAIN +############################ + +library(GEOquery) + +# fetch data for GSE number +eset <- getGEO(queryGSE)[[1]] + +# write probeset annotation +write.table(fData(eset)[,c('ID','Entrez_Gene_ID','Symbol','Definition')], + paste0(queryGSE,'.annotation.tsv'), + col.names=TRUE, row.names=FALSE, sep="\t", quote=FALSE) + +output_prefix <- '$task.ext.prefix' +saveRDS(eset, file = paste0(output_prefix, 'eset.rds')) + +# write intensity matrix (normalised) +write.table( + data.frame( + probe_id = rownames(eset), + round_dataframe_columns(as.data.frame(exprs(eset))), + check.names = FALSE + ), + file = paste0(output_prefix, '.matrix.tsv'), + col.names = TRUE, row.names = FALSE, + sep = '\t', quote = FALSE +) + + +############################ +# LOG SESSION AND VERSIONS +############################ + + +sink("R_sessionInfo.log") +print(sessionInfo()) +sink() + +r.version <- strsplit(version[['version.string']], ' ')[[1]][3] +geoquery.version <- as.character(packageVersion("GEOquery")) + +writeLines( + c( + '"${task.process}":', + paste(' r-base:', r.version), + paste(' bioconductor-:', geoquery.version) + ), + 'versions.yml') From 555534091c450c55175ee6a2e83475e529559c6e Mon Sep 17 00:00:00 2001 From: Azedine Zoufir Date: Tue, 13 Jun 2023 10:52:51 +0100 
Subject: [PATCH 02/72] add read_from_soft process to pipeline --- modules/local/read_from_geo.nf | 24 ++++++++++++++++++++++++ modules/local/read_from_soft.nf | 24 ++++++++++++++++++++++++ 2 files changed, 48 insertions(+) create mode 100644 modules/local/read_from_geo.nf create mode 100644 modules/local/read_from_soft.nf diff --git a/modules/local/read_from_geo.nf b/modules/local/read_from_geo.nf new file mode 100644 index 00000000..2d618bc8 --- /dev/null +++ b/modules/local/read_from_geo.nf @@ -0,0 +1,24 @@ +process READ_FROM_SOFT { + tag "$meta.id" + label 'process_single' + + conda "bioconda::bioconductor-geoquery=2.66.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bioconductor-geoquery:2.66.0--r42hdfd78af_0': + 'biocontainers/bioconductor-geoquery==2.66.0--r42hdfd78af_0' }" + + input: + tuple val(meta), val(querygse) + + output: + tuple val(meta), path("*.rds") , emit: rds + tuple val(meta), path("*.matrix.tsv") , emit: expression + tuple val(meta), path("*.annotation.tsv") , emit: annotation, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + template 'read_soft_matrix.R' +} diff --git a/modules/local/read_from_soft.nf b/modules/local/read_from_soft.nf new file mode 100644 index 00000000..2d618bc8 --- /dev/null +++ b/modules/local/read_from_soft.nf @@ -0,0 +1,24 @@ +process READ_FROM_SOFT { + tag "$meta.id" + label 'process_single' + + conda "bioconda::bioconductor-geoquery=2.66.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/bioconductor-geoquery:2.66.0--r42hdfd78af_0': + 'biocontainers/bioconductor-geoquery==2.66.0--r42hdfd78af_0' }" + + input: + tuple val(meta), val(querygse) + + output: + tuple val(meta), path("*.rds") , emit: rds + tuple val(meta), path("*.matrix.tsv") , emit: expression + tuple val(meta), path("*.annotation.tsv") , emit: annotation, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + template 'read_soft_matrix.R' +} From 1d2f9ece91cb438520e3ac40681f8bd336c283cc Mon Sep 17 00:00:00 2001 From: Azedine Zoufir Date: Tue, 13 Jun 2023 11:03:52 +0100 Subject: [PATCH 03/72] add configuration for read_from_soft process --- conf/modules.config | 24 ++++++++++++++++++++++++ modules/local/read_from_geo.nf | 24 ------------------------ 2 files changed, 24 insertions(+), 24 deletions(-) delete mode 100644 modules/local/read_from_geo.nf diff --git a/conf/modules.config b/conf/modules.config index bdd1ade9..6fdc1893 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -100,6 +100,30 @@ process { ].join(' ').trim() } } + withName: READ_FROM_SOFT { + publishDir = [ + [ + path: { "${params.outdir}/tables/processed_abundance" }, + mode: params.publish_dir_mode, + pattern: '*.matrix.tsv' + ], + [ + path: { "${params.outdir}/tables/annotation" }, + mode: params.publish_dir_mode, + pattern: '*.annotation.tsv' + ], + [ + path: { "${params.outdir}/other/affy" }, + mode: params.publish_dir_mode, + pattern: '*.{rds,sessionInfo.log}' + ] + ] + ext.prefix = { "normalised." 
} + ext.args = { + "--querygse \"${params.querygse}\"".trim() + } + } + withName: DESEQ2_DIFFERENTIAL { publishDir = [ [ diff --git a/modules/local/read_from_geo.nf b/modules/local/read_from_geo.nf deleted file mode 100644 index 2d618bc8..00000000 --- a/modules/local/read_from_geo.nf +++ /dev/null @@ -1,24 +0,0 @@ -process READ_FROM_SOFT { - tag "$meta.id" - label 'process_single' - - conda "bioconda::bioconductor-geoquery=2.66.0" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/bioconductor-geoquery:2.66.0--r42hdfd78af_0': - 'biocontainers/bioconductor-geoquery==2.66.0--r42hdfd78af_0' }" - - input: - tuple val(meta), val(querygse) - - output: - tuple val(meta), path("*.rds") , emit: rds - tuple val(meta), path("*.matrix.tsv") , emit: expression - tuple val(meta), path("*.annotation.tsv") , emit: annotation, optional: true - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - template 'read_soft_matrix.R' -} From 0b48cef32f88cc7c9a691f1c9b607a90b0f28485 Mon Sep 17 00:00:00 2001 From: Azedine Zoufir Date: Tue, 13 Jun 2023 12:29:27 +0100 Subject: [PATCH 04/72] incorporate read from soft process into workflow --- modules/local/read_from_soft.nf | 4 +-- workflows/differentialabundance.nf | 39 ++++++++++++++++++++++++------ 2 files changed, 34 insertions(+), 9 deletions(-) diff --git a/modules/local/read_from_soft.nf b/modules/local/read_from_soft.nf index 2d618bc8..f10e8650 100644 --- a/modules/local/read_from_soft.nf +++ b/modules/local/read_from_soft.nf @@ -12,8 +12,8 @@ process READ_FROM_SOFT { output: tuple val(meta), path("*.rds") , emit: rds - tuple val(meta), path("*.matrix.tsv") , emit: expression - tuple val(meta), path("*.annotation.tsv") , emit: annotation, optional: true + tuple val(meta), path("*.matrix.tsv") , emit: expression + tuple val(meta), path("*.annotation.tsv") , emit: annotation path 
"versions.yml" , emit: versions when: diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf index 198f2a71..1098673b 100644 --- a/workflows/differentialabundance.nf +++ b/workflows/differentialabundance.nf @@ -22,16 +22,24 @@ if (params.study_type == 'affy_array'){ } else { error("CEL files archive not specified!") } -} else{ - - // If this is not an affy array, assume we're reading from a matrix - - if (params.matrix) { + // If this is another array platform and user wish to read from SOFT files + // then a GSE study identifier must be provided +} else if (params.study_type == 'non_affy_array'){ + if (params.querygse) { + ch_querygse = Channel.of([exp_meta, value(params.querygse, checkIfExists: true)]) + } else { + error("Query GSE not specified!") + } +} else { + // If this is not microarray data, and this an RNA-seq dataset, + // then assume we're reading from a matrix + if (params.study_type == "rnaseq" && params.matrix) { matrix_file = file(params.matrix, checkIfExists: true) ch_in_raw = Channel.of([ exp_meta, matrix_file]) } else { error("Input matrix not specified!") } + } // Check optional parameters @@ -64,6 +72,8 @@ citations_file = file(params.citations_file, checkIfExists: true) */ include { TABULAR_TO_GSEA_CHIP } from '../modules/local/tabular_to_gsea_chip' +include { READ_FROM_SOFT } from '../modules/local/read_from_soft' + /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -172,6 +182,15 @@ workflow DIFFERENTIALABUNDANCE { ch_versions = ch_versions .mix(GTF_TO_TABLE.out.versions) } + else if(params.study_type == "non_affy_array"){ + + ch_generic_array_input = ch_input + .join(ch_querygse) + + READ_FROM_SOFT(ch_generic_array_input) + ch_in_raw = READ_FROM_SOFT.out.expression + ch_features = READ_FROM_SOFT.out.annotation + } else{ // Otherwise we can just use the matrix input @@ -214,7 +233,13 @@ workflow DIFFERENTIALABUNDANCE { ch_raw = ch_validated_assays.raw ch_norm = 
ch_validated_assays.normalised ch_matrix_for_differential = ch_norm - } else{ + } + else if (params.study_type == 'non_affy_array') { + ch_raw = VALIDATOR.out.assays + ch_norm = VALIDATOR.out.assays + ch_matrix_for_differential = ch_norm + } + else{ ch_raw = VALIDATOR.out.assays ch_matrix_for_differential = ch_raw } @@ -247,7 +272,7 @@ workflow DIFFERENTIALABUNDANCE { .join(CUSTOM_MATRIXFILTER.out.filtered) // -> meta, samplesheet, filtered matrix .first() - if (params.study_type == 'affy_array'){ + if (params.study_type == 'affy_array' || 'non_affy_array'){ LIMMA_DIFFERENTIAL ( ch_contrasts, From 4b4aad65ba30bf44db5b5532a73ed7888cf47956 Mon Sep 17 00:00:00 2001 From: Azedine Zoufir Date: Tue, 13 Jun 2023 12:30:53 +0100 Subject: [PATCH 05/72] add querygse as an input parameter to workflow --- nextflow.config | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/nextflow.config b/nextflow.config index 731494e9..6730ba00 100644 --- a/nextflow.config +++ b/nextflow.config @@ -46,6 +46,7 @@ params { // Affy-specific options affy_cel_files_archive = null + querygse = null affy_file_name_col = 'file' affy_background = true affy_bgversion = 2 @@ -72,9 +73,9 @@ params { exploratory_assay_names = "raw,normalised,variance_stabilised" exploratory_final_assay = "variance_stabilised" exploratory_palette_name = 'Set1' - + // Differential options - differential_file_suffix = ".deseq2.results.tsv" + differential_file_suffix = ".deseq2.results.tsv" differential_feature_id_column = "gene_id" differential_feature_name_column = "gene_name" differential_fc_column = "log2FoldChange" @@ -86,7 +87,7 @@ params { differential_foldchanges_logged = true differential_palette_name = 'Set1' differential_subset_to_contrast_samples = false - + // DESeq2-specific options deseq2_test = "Wald" deseq2_fit_type = "parametric" @@ -126,13 +127,13 @@ params { gsea_nperm = 1000 gsea_permute = 'phenotype' - gsea_scoring_scheme = 'weighted' - gsea_metric = 'Signal2Noise' + 
gsea_scoring_scheme = 'weighted' + gsea_metric = 'Signal2Noise' gsea_sort = 'real' gsea_order = 'descending' gsea_set_max = 500 gsea_set_min = 15 - + gsea_norm = 'meandiv' gsea_rnd_type = 'no_balance' gsea_make_sets = true @@ -140,18 +141,18 @@ params { gsea_num = 100 gsea_plot_top_x = 20 gsea_rnd_seed = 'timestamp' - gsea_save_rnd_lists = false + gsea_save_rnd_lists = false gsea_zip_report = false - + gsea_gene_sets = null // ShinyNGS shinyngs_build_app = true - shinyngs_guess_unlog_matrices = true + shinyngs_guess_unlog_matrices = true // Note: for shinyapps deployment, in addition to setting these values, // SHINYAPPS_TOKEN and SHINYAPPS_SECRET must be available to the - // environment, probably via Nextflow secrets + // environment, probably via Nextflow secrets shinyngs_deploy_to_shinyapps_io = false shinyngs_shinyapps_account = null shinyngs_shinyapps_app_name = null From 27ec5120b87b7134cbd3d9799db2dc850d6a4385 Mon Sep 17 00:00:00 2001 From: Azedine Zoufir Date: Tue, 13 Jun 2023 12:46:03 +0100 Subject: [PATCH 06/72] add querygse to schema --- nextflow_schema.json | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/nextflow_schema.json b/nextflow_schema.json index 80241d6a..9284d32a 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -81,6 +81,13 @@ "description": "Alternative to matrix: a compressed CEL files archive such as often found in GEO", "fa_icon": "fas fa-file-archive", "help_text": "Use this option to provide a raw archive of CEL files from Affymetrix arrays. Will be ignored if a matrix is specified." + }, + "querygse": { + "type": "string", + "default": "None", + "description": "Alternative to CEL archive: the GSE identifier as found in GEO", + "fa_icon": "fas fa-keyboard", + "help_text": "Use this option to provide a GSE study identifier." 
} } }, From cfa223c2b19d17399018cbca5687de734a85730a Mon Sep 17 00:00:00 2001 From: Azedine Zoufir Date: Tue, 13 Jun 2023 13:25:34 +0100 Subject: [PATCH 07/72] add 'non_affy_array' as possible value for study type param --- nextflow_schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 9284d32a..a01c87ff 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -24,7 +24,7 @@ "default": "rnaseq", "description": "A string identifying the technology used to produce the data", "help_text": "Currently 'rnaseq' or 'affy_array' may be specified.", - "enum": ["rnaseq", "affy_array"], + "enum": ["rnaseq", "affy_array", "non_affy_array"], "fa_icon": "far fa-keyboard" }, "input": { From 8e32b081563d61c113281a89005139f1d766f138 Mon Sep 17 00:00:00 2001 From: Azedine Zoufir Date: Wed, 14 Jun 2023 13:30:57 +0100 Subject: [PATCH 08/72] fixing configs for read_from_soft process --- modules/local/read_from_soft.nf | 6 +- modules/local/templates/read_soft_matrix.R | 22 +++--- nextflow.config | 2 +- workflows/differentialabundance.nf | 86 +++++++++++----------- 4 files changed, 58 insertions(+), 58 deletions(-) diff --git a/modules/local/read_from_soft.nf b/modules/local/read_from_soft.nf index f10e8650..e8addf0b 100644 --- a/modules/local/read_from_soft.nf +++ b/modules/local/read_from_soft.nf @@ -4,15 +4,15 @@ process READ_FROM_SOFT { conda "bioconda::bioconductor-geoquery=2.66.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/bioconductor-geoquery:2.66.0--r42hdfd78af_0': + 'https://depot.galaxyproject.org/singularity/bioconductor-geoquery:2.66.0--r42hdfd78af_0' : 'biocontainers/bioconductor-geoquery==2.66.0--r42hdfd78af_0' }" input: - tuple val(meta), val(querygse) + tuple val(meta), path(samplesheet), val(querygse) output: tuple val(meta), path("*.rds") , emit: rds - tuple val(meta), path("*.matrix.tsv") , emit: expression + tuple val(meta), path("*matrix.tsv") , emit: expression tuple val(meta), path("*.annotation.tsv") , emit: annotation path "versions.yml" , emit: versions diff --git a/modules/local/templates/read_soft_matrix.R b/modules/local/templates/read_soft_matrix.R index b1746b3f..e0818f77 100644 --- a/modules/local/templates/read_soft_matrix.R +++ b/modules/local/templates/read_soft_matrix.R @@ -10,10 +10,10 @@ parse_args <- function(x){ args_list <- unlist(strsplit(x, ' ?--')[[1]])[-1] args_vals <- lapply(args_list, function(x) scan(text=x, what='character', quiet = TRUE)) - + # Ensure the option vectors are length 2 (key/ value) to catch empty ones args_vals <- lapply(args_vals, function(z){ length(z) <- 2; z}) - + parsed_args <- structure(lapply(args_vals, function(x) x[2]), names = lapply(args_vals, function(x) x[1])) parsed_args[! 
is.na(parsed_args)] } @@ -28,13 +28,13 @@ round_dataframe_columns <- function(df, columns = NULL, digits = 8){ if (is.null(columns)){ columns <- colnames(df) } - + df[,columns] <- format(data.frame(df[, columns]), nsmall = digits) - + # Convert columns back to numeric - + for (c in columns) { - df[[c]][grep("^ *NA$", df[[c]])] <- NA + df[[c]][grep("^ *NA\$", df[[c]])] <- NA df[[c]] <- as.numeric(df[[c]]) } df @@ -45,7 +45,7 @@ round_dataframe_columns <- function(df, columns = NULL, digits = 8){ ############################ opt <- list( - queryGSE = '$querygse' + querygse = '$querygse' ) args_opt <- parse_args('$task.ext.args') for ( ao in names(args_opt)){ @@ -63,11 +63,11 @@ for ( ao in names(args_opt)){ library(GEOquery) # fetch data for GSE number -eset <- getGEO(queryGSE)[[1]] +eset <- getGEO(opt\$querygse)[[1]] # write probeset annotation -write.table(fData(eset)[,c('ID','Entrez_Gene_ID','Symbol','Definition')], - paste0(queryGSE,'.annotation.tsv'), +write.table(fData(eset)[,c('ID','Entrez_Gene_ID','Symbol','Definition')], + paste0(opt\$querygse,'.annotation.tsv'), col.names=TRUE, row.names=FALSE, sep="\t", quote=FALSE) output_prefix <- '$task.ext.prefix' @@ -80,7 +80,7 @@ write.table( round_dataframe_columns(as.data.frame(exprs(eset))), check.names = FALSE ), - file = paste0(output_prefix, '.matrix.tsv'), + file = paste0(output_prefix, 'matrix.tsv'), col.names = TRUE, row.names = FALSE, sep = '\t', quote = FALSE ) diff --git a/nextflow.config b/nextflow.config index 6730ba00..5650acd7 100644 --- a/nextflow.config +++ b/nextflow.config @@ -15,6 +15,7 @@ params { study_type = 'rnaseq' study_abundance_type = 'counts' contrasts = null + querygse = null matrix = null control_features = null sizefactors_from_controls = null @@ -46,7 +47,6 @@ params { // Affy-specific options affy_cel_files_archive = null - querygse = null affy_file_name_col = 'file' affy_background = true affy_bgversion = 2 diff --git a/workflows/differentialabundance.nf 
b/workflows/differentialabundance.nf index 1098673b..8491f988 100644 --- a/workflows/differentialabundance.nf +++ b/workflows/differentialabundance.nf @@ -24,9 +24,9 @@ if (params.study_type == 'affy_array'){ } // If this is another array platform and user wish to read from SOFT files // then a GSE study identifier must be provided -} else if (params.study_type == 'non_affy_array'){ +} else if (params.study_type == 'non_affy_array' && params.querygse != ""){ if (params.querygse) { - ch_querygse = Channel.of([exp_meta, value(params.querygse, checkIfExists: true)]) + ch_querygse = Channel.of([exp_meta, params.querygse]) } else { error("Query GSE not specified!") } @@ -147,58 +147,58 @@ workflow DIFFERENTIALABUNDANCE { ch_affy_platform_features = AFFY_JUSTRMA_RAW.out.annotation } + else if(params.study_type == 'non_affy_array'){ - //// Fetch or derive a feature annotation table - - // If user has provided a feature annotation table, use that - - if (params.features){ - ch_features = Channel.of([ exp_meta, file(params.features, checkIfExists: true)]) - } else if (params.study_type == 'affy_array'){ - ch_features = ch_affy_platform_features - } else if (params.gtf){ - // Get feature annotations from a GTF file, gunzip if necessary - - file_gtf_in = file(params.gtf) - file_gtf = [ [ "id": file_gtf_in.simpleName ], file_gtf_in ] + ch_generic_array_input = ch_input + .join(ch_querygse) - if ( params.gtf.endsWith('.gz') ){ - GUNZIP_GTF(file_gtf) - file_gtf = GUNZIP_GTF.out.gunzip - ch_versions = ch_versions.mix(GUNZIP_GTF.out.versions) - } + READ_FROM_SOFT(ch_generic_array_input) + ch_in_raw = READ_FROM_SOFT.out.expression + ch_features = READ_FROM_SOFT.out.annotation + } - // Get a features table from the GTF and combine with the matrix and sample - // annotation (fom = features/ observations/ matrix) + //// Fetch or derive a feature annotation table - GTF_TO_TABLE( file_gtf, [[ "id":""], []]) - ch_features = GTF_TO_TABLE.out.feature_annotation - .map{ - tuple( exp_meta, 
it[1]) + // If user has provided a feature annotation table, use that + if(params.study_type != 'non_affy_array') { + if (params.features){ + ch_features = Channel.of([ exp_meta, file(params.features, checkIfExists: true)]) + } else if (params.study_type == 'affy_array'){ + ch_features = ch_affy_platform_features + } else if (params.gtf){ + // Get feature annotations from a GTF file, gunzip if necessary + + file_gtf_in = file(params.gtf) + file_gtf = [ [ "id": file_gtf_in.simpleName ], file_gtf_in ] + + if ( params.gtf.endsWith('.gz') ){ + GUNZIP_GTF(file_gtf) + file_gtf = GUNZIP_GTF.out.gunzip + ch_versions = ch_versions.mix(GUNZIP_GTF.out.versions) } - // Record the version of the GTF -> table tool + // Get a features table from the GTF and combine with the matrix and sample + // annotation (fom = features/ observations/ matrix) - ch_versions = ch_versions - .mix(GTF_TO_TABLE.out.versions) - } - else if(params.study_type == "non_affy_array"){ + GTF_TO_TABLE( file_gtf, [[ "id":""], []]) + ch_features = GTF_TO_TABLE.out.feature_annotation + .map{ + tuple( exp_meta, it[1]) + } - ch_generic_array_input = ch_input - .join(ch_querygse) + // Record the version of the GTF -> table tool - READ_FROM_SOFT(ch_generic_array_input) - ch_in_raw = READ_FROM_SOFT.out.expression - ch_features = READ_FROM_SOFT.out.annotation - } - else{ + ch_versions = ch_versions + .mix(GTF_TO_TABLE.out.versions) + } + else{ - // Otherwise we can just use the matrix input - matrix_as_anno_filename = "matrix_as_anno.${matrix_file.getExtension()}" - matrix_file.copyTo(matrix_as_anno_filename) - ch_features = Channel.of([ exp_meta, file(matrix_as_anno_filename)]) + // Otherwise we can just use the matrix input + matrix_as_anno_filename = "matrix_as_anno.${matrix_file.getExtension()}" + matrix_file.copyTo(matrix_as_anno_filename) + ch_features = Channel.of([ exp_meta, file(matrix_as_anno_filename)]) + } } - // Channel for the contrasts file ch_contrasts_file = Channel.from([[exp_meta, 
file(params.contrasts)]]) From 3af5ed33cea1f78a27a6ab18a06b50c477bd7dbe Mon Sep 17 00:00:00 2001 From: Azedine Zoufir Date: Wed, 14 Jun 2023 14:53:06 +0100 Subject: [PATCH 09/72] fix geoquery container url in read_from_soft.nf --- modules/local/read_from_soft.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/read_from_soft.nf b/modules/local/read_from_soft.nf index e8addf0b..011206df 100644 --- a/modules/local/read_from_soft.nf +++ b/modules/local/read_from_soft.nf @@ -5,7 +5,7 @@ process READ_FROM_SOFT { conda "bioconda::bioconductor-geoquery=2.66.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/bioconductor-geoquery:2.66.0--r42hdfd78af_0' : - 'biocontainers/bioconductor-geoquery==2.66.0--r42hdfd78af_0' }" + 'quay.io/biocontainers/bioconductor-geoquery:2.66.0--r42hdfd78af_0' }" input: tuple val(meta), path(samplesheet), val(querygse) From 122767476513a1dbbc8787f9a8e132b6de221d9f Mon Sep 17 00:00:00 2001 From: Azedine Zoufir Date: Wed, 14 Jun 2023 16:07:34 +0100 Subject: [PATCH 10/72] remove raw matrix processing for [non_affy_array] track --- workflows/differentialabundance.nf | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf index 8491f988..a5ab9b74 100644 --- a/workflows/differentialabundance.nf +++ b/workflows/differentialabundance.nf @@ -235,7 +235,6 @@ workflow DIFFERENTIALABUNDANCE { ch_matrix_for_differential = ch_norm } else if (params.study_type == 'non_affy_array') { - ch_raw = VALIDATOR.out.assays ch_norm = VALIDATOR.out.assays ch_matrix_for_differential = ch_norm } @@ -384,6 +383,7 @@ workflow DIFFERENTIALABUNDANCE { } .unique() + if(params.study_type != "non_affy_array") { ch_all_matrices = VALIDATOR.out.sample_meta // meta, samples .join(VALIDATOR.out.feature_meta) // meta, samples, features .join(ch_raw) // meta, 
samples, features, raw matrix @@ -392,6 +392,16 @@ workflow DIFFERENTIALABUNDANCE { tuple(it[0], it[1], it[2], it[3..it.size()-1]) } .first() + } + else { + ch_all_matrices = VALIDATOR.out.sample_meta // meta, samples + .join(VALIDATOR.out.feature_meta) // meta, samples, features + .combine(ch_processed_matrices) // meta, samples, features, norm, ... + .map{ + tuple(it[0], it[1], it[2..it.size()-1]) + } + .first() + } ch_contrast_variables .combine(ch_all_matrices.map{ it.tail() }) @@ -483,7 +493,7 @@ workflow DIFFERENTIALABUNDANCE { // Condition params reported on study type def params_pattern = ~/^(report|study|observations|features|filtering|exploratory|differential|deseq2|gsea).*/ - if (params.study_type == 'affy_array'){ + if (params.study_type == 'affy_array' || 'non_affy_array'){ params_pattern = ~/^(report|study|observations|features|filtering|exploratory|differential|affy|limma|gsea).*/ } From baaf7916672e087fe30cb44d5e3f7adc56b27764 Mon Sep 17 00:00:00 2001 From: Azedine Zoufir Date: Wed, 14 Jun 2023 16:36:03 +0100 Subject: [PATCH 11/72] remove raw matrix processing for [non_affy_array] track --- workflows/differentialabundance.nf | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf index a5ab9b74..5000f02e 100644 --- a/workflows/differentialabundance.nf +++ b/workflows/differentialabundance.nf @@ -384,23 +384,23 @@ workflow DIFFERENTIALABUNDANCE { .unique() if(params.study_type != "non_affy_array") { - ch_all_matrices = VALIDATOR.out.sample_meta // meta, samples - .join(VALIDATOR.out.feature_meta) // meta, samples, features - .join(ch_raw) // meta, samples, features, raw matrix - .combine(ch_processed_matrices) // meta, samples, features, raw, norm, ... 
- .map{ - tuple(it[0], it[1], it[2], it[3..it.size()-1]) - } - .first() + ch_all_matrices = VALIDATOR.out.sample_meta // meta, samples + .join(VALIDATOR.out.feature_meta) // meta, samples, features + .join(ch_raw) // meta, samples, features, raw matrix + .combine(ch_processed_matrices) // meta, samples, features, raw, norm, ... + .map{ + tuple(it[0], it[1], it[2], it[3..it.size()-1]) + } + .first() } else { - ch_all_matrices = VALIDATOR.out.sample_meta // meta, samples - .join(VALIDATOR.out.feature_meta) // meta, samples, features - .combine(ch_processed_matrices) // meta, samples, features, norm, ... - .map{ - tuple(it[0], it[1], it[2..it.size()-1]) - } - .first() + ch_all_matrices = VALIDATOR.out.sample_meta // meta, samples + .join(VALIDATOR.out.feature_meta) // meta, samples, features + .join(ch_processed_matrices) // meta, samples, features, norm, ... + .map{ + tuple(it[0], it[1], it[2]) + } + .first() } ch_contrast_variables From e1ff15c65aba2b556ac54d3576af7196f0ea696f Mon Sep 17 00:00:00 2001 From: Azedine Zoufir Date: Wed, 14 Jun 2023 17:23:30 +0100 Subject: [PATCH 12/72] attempt to fix premature completion of pipeline --- workflows/differentialabundance.nf | 1 - 1 file changed, 1 deletion(-) diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf index 5000f02e..bcab854c 100644 --- a/workflows/differentialabundance.nf +++ b/workflows/differentialabundance.nf @@ -400,7 +400,6 @@ workflow DIFFERENTIALABUNDANCE { .map{ tuple(it[0], it[1], it[2]) } - .first() } ch_contrast_variables From 3b773a8c1243cb14cf8b3f19df078447fd150854 Mon Sep 17 00:00:00 2001 From: Azedine Zoufir Date: Wed, 14 Jun 2023 17:41:16 +0100 Subject: [PATCH 13/72] attempt to fix premature completion of pipeline --- workflows/differentialabundance.nf | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf index bcab854c..16853b20 100644 --- 
a/workflows/differentialabundance.nf +++ b/workflows/differentialabundance.nf @@ -398,15 +398,16 @@ workflow DIFFERENTIALABUNDANCE { .join(VALIDATOR.out.feature_meta) // meta, samples, features .join(ch_processed_matrices) // meta, samples, features, norm, ... .map{ - tuple(it[0], it[1], it[2]) + tuple(it[0], it[1], it[2..it.size()-1]) } + .first() } - ch_contrast_variables - .combine(ch_all_matrices.map{ it.tail() }) - - ch_contrast_variables - .combine(ch_all_matrices.map{ it.tail() }) +// ch_contrast_variables +// .combine(ch_all_matrices.map{ it.tail() }) +// +// ch_contrast_variables +// .combine(ch_all_matrices.map{ it.tail() }) PLOT_EXPLORATORY( ch_contrast_variables From 50682952e10feff7fa2d4880086246d5fd37492a Mon Sep 17 00:00:00 2001 From: Azedine Zoufir Date: Wed, 14 Jun 2023 18:18:12 +0100 Subject: [PATCH 14/72] debug around contrasts & matrix filter --- workflows/differentialabundance.nf | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf index 16853b20..c70f20ab 100644 --- a/workflows/differentialabundance.nf +++ b/workflows/differentialabundance.nf @@ -257,6 +257,8 @@ workflow DIFFERENTIALABUNDANCE { } tuple(it, it.variable, it.reference, it.target) } + println "contrasts" + ch.contrast.view() // Firstly Filter the input matrix @@ -270,6 +272,8 @@ workflow DIFFERENTIALABUNDANCE { ch_samples_and_matrix = VALIDATOR.out.sample_meta .join(CUSTOM_MATRIXFILTER.out.filtered) // -> meta, samplesheet, filtered matrix .first() + println "matrixfilter" + ch_samples_and_matrix.view() if (params.study_type == 'affy_array' || 'non_affy_array'){ From 03e30fdd17013bc0f936d8b8e3eab4e0de31d000 Mon Sep 17 00:00:00 2001 From: Azedine Zoufir Date: Wed, 14 Jun 2023 18:58:15 +0100 Subject: [PATCH 15/72] debug around contrasts & matrix filter --- workflows/differentialabundance.nf | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/workflows/differentialabundance.nf 
b/workflows/differentialabundance.nf index c70f20ab..7449cba5 100644 --- a/workflows/differentialabundance.nf +++ b/workflows/differentialabundance.nf @@ -257,8 +257,8 @@ workflow DIFFERENTIALABUNDANCE { } tuple(it, it.variable, it.reference, it.target) } - println "contrasts" - ch.contrast.view() + println "contrasts ----------------" + ch_contrasts.view() // Firstly Filter the input matrix @@ -272,7 +272,7 @@ workflow DIFFERENTIALABUNDANCE { ch_samples_and_matrix = VALIDATOR.out.sample_meta .join(CUSTOM_MATRIXFILTER.out.filtered) // -> meta, samplesheet, filtered matrix .first() - println "matrixfilter" + println "matrixfilter ----------------" ch_samples_and_matrix.view() if (params.study_type == 'affy_array' || 'non_affy_array'){ @@ -378,6 +378,8 @@ workflow DIFFERENTIALABUNDANCE { .mix(GSEA_GSEA.out.versions) } + println "are we even getting here? ----------------" + // The exploratory plots are made by coloring by every unique variable used // to define contrasts From 054275a4f4c8747c2ac877155bbd35e3e1b6bec8 Mon Sep 17 00:00:00 2001 From: Azedine Zoufir Date: Wed, 14 Jun 2023 19:35:03 +0100 Subject: [PATCH 16/72] debug matrix input processing for exploratory plots in [non_affy_array] track --- workflows/differentialabundance.nf | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf index 7449cba5..9ba60240 100644 --- a/workflows/differentialabundance.nf +++ b/workflows/differentialabundance.nf @@ -156,7 +156,6 @@ workflow DIFFERENTIALABUNDANCE { ch_in_raw = READ_FROM_SOFT.out.expression ch_features = READ_FROM_SOFT.out.annotation } - //// Fetch or derive a feature annotation table // If user has provided a feature annotation table, use that @@ -257,8 +256,6 @@ workflow DIFFERENTIALABUNDANCE { } tuple(it, it.variable, it.reference, it.target) } - println "contrasts ----------------" - ch_contrasts.view() // Firstly Filter the input matrix @@ -272,7 +269,6 @@ workflow 
DIFFERENTIALABUNDANCE { ch_samples_and_matrix = VALIDATOR.out.sample_meta .join(CUSTOM_MATRIXFILTER.out.filtered) // -> meta, samplesheet, filtered matrix .first() - println "matrixfilter ----------------" ch_samples_and_matrix.view() if (params.study_type == 'affy_array' || 'non_affy_array'){ @@ -378,7 +374,6 @@ workflow DIFFERENTIALABUNDANCE { .mix(GSEA_GSEA.out.versions) } - println "are we even getting here? ----------------" // The exploratory plots are made by coloring by every unique variable used // to define contrasts @@ -402,9 +397,9 @@ workflow DIFFERENTIALABUNDANCE { else { ch_all_matrices = VALIDATOR.out.sample_meta // meta, samples .join(VALIDATOR.out.feature_meta) // meta, samples, features - .join(ch_processed_matrices) // meta, samples, features, norm, ... + .combine(ch_processed_matrices) // meta, samples, features, norm, ... .map{ - tuple(it[0], it[1], it[2..it.size()-1]) + tuple(it[0], it[1], it[2], it[3..it.size()-1]) } .first() } From 4092a930681529d290d9d96c84de93ddd6bdd7f3 Mon Sep 17 00:00:00 2001 From: Azedine Zoufir Date: Wed, 14 Jun 2023 20:19:43 +0100 Subject: [PATCH 17/72] do not filter matrix in [non_affy_array] track --- workflows/differentialabundance.nf | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf index 9ba60240..289218c6 100644 --- a/workflows/differentialabundance.nf +++ b/workflows/differentialabundance.nf @@ -258,18 +258,21 @@ workflow DIFFERENTIALABUNDANCE { } // Firstly Filter the input matrix - - CUSTOM_MATRIXFILTER( - ch_matrix_for_differential, - VALIDATOR.out.sample_meta - ) - - // Prepare inputs for differential processes - - ch_samples_and_matrix = VALIDATOR.out.sample_meta - .join(CUSTOM_MATRIXFILTER.out.filtered) // -> meta, samplesheet, filtered matrix - .first() - ch_samples_and_matrix.view() + if(params.study_type != 'non_affy_array'){ + CUSTOM_MATRIXFILTER( + ch_matrix_for_differential, + 
VALIDATOR.out.sample_meta + ) + // Prepare inputs for differential processes + ch_samples_and_matrix = VALIDATOR.out.sample_meta + .join(CUSTOM_MATRIXFILTER.out.filtered) // -> meta, samplesheet, filtered matrix + .first() + } + else { + ch_samples_and_matrix = VALIDATOR.out.sample_meta + .join(ch_matrix_for_differential) + .first() + } if (params.study_type == 'affy_array' || 'non_affy_array'){ From 7cc0a1709a5b01426e2770b9cbb4e4997b097ac4 Mon Sep 17 00:00:00 2001 From: Azedine Zoufir Date: Thu, 15 Jun 2023 13:21:17 +0100 Subject: [PATCH 18/72] revert debug and apply log2 transformation to READ_FROM_SOFT output --- modules/local/templates/read_soft_matrix.R | 7 ++++++ workflows/differentialabundance.nf | 26 +++++++++------------- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/modules/local/templates/read_soft_matrix.R b/modules/local/templates/read_soft_matrix.R index e0818f77..7bdc97b9 100644 --- a/modules/local/templates/read_soft_matrix.R +++ b/modules/local/templates/read_soft_matrix.R @@ -70,6 +70,13 @@ write.table(fData(eset)[,c('ID','Entrez_Gene_ID','Symbol','Definition')], paste0(opt\$querygse,'.annotation.tsv'), col.names=TRUE, row.names=FALSE, sep="\t", quote=FALSE) + +# if data is not log scale, transform it as needed for limma downstream +if(max(exprs(eset)) > 20) { # a bit dirty, needs proper solution later... 
+ exprs(eset)[exprs(eset) <= 0] <- .001 + exprs(eset) <- log2(exprs(eset)) +} + output_prefix <- '$task.ext.prefix' saveRDS(eset, file = paste0(output_prefix, 'eset.rds')) diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf index 289218c6..88fb9c76 100644 --- a/workflows/differentialabundance.nf +++ b/workflows/differentialabundance.nf @@ -258,21 +258,17 @@ workflow DIFFERENTIALABUNDANCE { } // Firstly Filter the input matrix - if(params.study_type != 'non_affy_array'){ - CUSTOM_MATRIXFILTER( - ch_matrix_for_differential, - VALIDATOR.out.sample_meta - ) - // Prepare inputs for differential processes - ch_samples_and_matrix = VALIDATOR.out.sample_meta - .join(CUSTOM_MATRIXFILTER.out.filtered) // -> meta, samplesheet, filtered matrix - .first() - } - else { - ch_samples_and_matrix = VALIDATOR.out.sample_meta - .join(ch_matrix_for_differential) - .first() - } + + CUSTOM_MATRIXFILTER( + ch_matrix_for_differential, + VALIDATOR.out.sample_meta + ) + + // Prepare inputs for differential processes + + ch_samples_and_matrix = VALIDATOR.out.sample_meta + .join(CUSTOM_MATRIXFILTER.out.filtered) // -> meta, samplesheet, filtered matrix + .first() if (params.study_type == 'affy_array' || 'non_affy_array'){ From 8df1f282285de3f5e931b43015f67cf845df5991 Mon Sep 17 00:00:00 2001 From: Azedine Zoufir Date: Thu, 15 Jun 2023 13:43:28 +0100 Subject: [PATCH 19/72] add test SOFT matrix track config --- conf/test_soft_array.config | 51 +++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 conf/test_soft_array.config diff --git a/conf/test_soft_array.config b/conf/test_soft_array.config new file mode 100644 index 00000000..c595ea73 --- /dev/null +++ b/conf/test_soft_array.config @@ -0,0 +1,51 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running affymetrix array analysis 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines settings specific to affy array analysis + + Use as follows: + nextflow run nf-core/differentialabundance -profile --outdir + +---------------------------------------------------------------------------------------- +*/ + +params { + + config_profile_name = 'SOFT matrix track test profile' + config_profile_description = 'Minimal settings for test of the SOFT matrix track' + + // Input + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/differentialabundance/testdata/GSE50790.csv' + contrasts = 'https://raw.githubusercontent.com/nf-core/test-datasets/differentialabundance/testdata/GSE50790_contrasts.csv' + querygse = 'GSE50790' + + // Study + study_type = 'non_affy_array' + study_abundance_type = 'intensities' + + // Observations + observations_id_col = 'name' + observations_name_col = 'name' + + + // Features + features_id_col = 'PROBEID' + features_metadata_cols = 'PROBEID,ENSEMBL,SYMBOL,GENETYPE' + features_name_col = 'SYMBOL' + + + // Exploratory + exploratory_assay_names = 'normalised' + exploratory_final_assay = 'normalised' + + // Differential options + differential_file_suffix = ".limma.results.tsv" + differential_fc_column = "logFC" + differential_pval_column = "P.Value" + differential_qval_column = "adj.P.Val" + differential_feature_id_column = "probe_id" + differential_feature_name_column = "Symbol" + +} + From b4b8f8b00b7b957fcc6688a141c5c3ab17532aac Mon Sep 17 00:00:00 2001 From: Azedine Zoufir Date: Thu, 15 Jun 2023 13:53:19 +0100 Subject: [PATCH 20/72] add test SOFT matrix track config --- nextflow.config | 1 + 1 file changed, 1 insertion(+) diff --git a/nextflow.config b/nextflow.config index 5650acd7..ebcedcd0 100644 --- a/nextflow.config +++ b/nextflow.config @@ -311,6 +311,7 @@ profiles { affy { includeConfig 'conf/affy.config' } rnaseq { includeConfig 'conf/rnaseq.config' } test_affy { includeConfig 'conf/test_affy.config' } + 
test_soft {includeConfig} 'conf/test_soft_array.config' } } // Set default registry for Docker and Podman independent of -profile From 735999a5ba9c4d89a594b2e44b7c06f7e798d914 Mon Sep 17 00:00:00 2001 From: Azedine Zoufir Date: Thu, 15 Jun 2023 13:58:06 +0100 Subject: [PATCH 21/72] fix typo in config file --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index ebcedcd0..42a85113 100644 --- a/nextflow.config +++ b/nextflow.config @@ -311,7 +311,7 @@ profiles { affy { includeConfig 'conf/affy.config' } rnaseq { includeConfig 'conf/rnaseq.config' } test_affy { includeConfig 'conf/test_affy.config' } - test_soft {includeConfig} 'conf/test_soft_array.config' } + test_soft {includeConfig 'conf/test_soft_array.config' } } // Set default registry for Docker and Podman independent of -profile From ab1cb7141a9f16a0c7ee3cb2bfcc2ecb937e0468 Mon Sep 17 00:00:00 2001 From: Azedine Zoufir Date: Thu, 15 Jun 2023 15:58:04 +0100 Subject: [PATCH 22/72] add extra input to READ_SOFT_MATRIX to process metacolumns (differ by array platform) --- conf/modules.config | 8 +++++--- conf/test_soft_array.config | 6 +++--- modules/local/read_from_soft.nf | 2 +- modules/local/templates/read_soft_matrix.R | 7 ++++++- workflows/differentialabundance.nf | 6 +++--- 5 files changed, 18 insertions(+), 11 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 6fdc1893..14ae6bd4 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -119,9 +119,11 @@ process { ] ] ext.prefix = { "normalised." 
} - ext.args = { - "--querygse \"${params.querygse}\"".trim() - } + ext.args = {[ + "--querygse \"${params.querygse}\"" + "-- metacols \"${params.features_metadata_cols}\"" + ].join(' ').trim() , + ]} } withName: DESEQ2_DIFFERENTIAL { diff --git a/conf/test_soft_array.config b/conf/test_soft_array.config index c595ea73..dcc9e083 100644 --- a/conf/test_soft_array.config +++ b/conf/test_soft_array.config @@ -30,9 +30,9 @@ params { // Features - features_id_col = 'PROBEID' - features_metadata_cols = 'PROBEID,ENSEMBL,SYMBOL,GENETYPE' - features_name_col = 'SYMBOL' + features_id_col = 'ID' + features_metadata_cols = 'ID,ENTREZ_GENE_ID,Gene Symbol,Sequence Type' + features_name_col = 'Gene Symbol' // Exploratory diff --git a/modules/local/read_from_soft.nf b/modules/local/read_from_soft.nf index 011206df..4dbfcfef 100644 --- a/modules/local/read_from_soft.nf +++ b/modules/local/read_from_soft.nf @@ -8,7 +8,7 @@ process READ_FROM_SOFT { 'quay.io/biocontainers/bioconductor-geoquery:2.66.0--r42hdfd78af_0' }" input: - tuple val(meta), path(samplesheet), val(querygse) + tuple val(meta), path(samplesheet), val(querygse), val(metacols) output: tuple val(meta), path("*.rds") , emit: rds diff --git a/modules/local/templates/read_soft_matrix.R b/modules/local/templates/read_soft_matrix.R index 7bdc97b9..36b0621b 100644 --- a/modules/local/templates/read_soft_matrix.R +++ b/modules/local/templates/read_soft_matrix.R @@ -46,6 +46,7 @@ round_dataframe_columns <- function(df, columns = NULL, digits = 8){ opt <- list( querygse = '$querygse' + metacols = '$features_metadata_cols' ) args_opt <- parse_args('$task.ext.args') for ( ao in names(args_opt)){ @@ -65,8 +66,12 @@ library(GEOquery) # fetch data for GSE number eset <- getGEO(opt\$querygse)[[1]] +# parse metadata columns from nextflow parameters +# to subset on the feature metadata file +feature_cols = strsplit(opt\$metacols,',')[[1]] + # write probeset annotation 
-write.table(fData(eset)[,c('ID','Entrez_Gene_ID','Symbol','Definition')], +write.table(fData(eset)[,feature_cols], paste0(opt\$querygse,'.annotation.tsv'), col.names=TRUE, row.names=FALSE, sep="\t", quote=FALSE) diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf index 88fb9c76..c81fb586 100644 --- a/workflows/differentialabundance.nf +++ b/workflows/differentialabundance.nf @@ -24,11 +24,11 @@ if (params.study_type == 'affy_array'){ } // If this is another array platform and user wish to read from SOFT files // then a GSE study identifier must be provided -} else if (params.study_type == 'non_affy_array' && params.querygse != ""){ +} else if (params.study_type == 'non_affy_array' && params.querygse != "" && params.features_metadata_cols != ""){ if (params.querygse) { - ch_querygse = Channel.of([exp_meta, params.querygse]) + ch_querygse = Channel.of([exp_meta, params.querygse, params.features_metadata_cols]) } else { - error("Query GSE not specified!") + error("Query GSE not specified or features metadata columns not specified") } } else { // If this is not microarray data, and this an RNA-seq dataset, From af0518edb102280b07087c61e9ed8d4fc65ca878 Mon Sep 17 00:00:00 2001 From: Azedine Zoufir Date: Thu, 15 Jun 2023 16:04:31 +0100 Subject: [PATCH 23/72] fix typo in config --- conf/modules.config | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 14ae6bd4..920dc1be 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -122,8 +122,7 @@ process { ext.args = {[ "--querygse \"${params.querygse}\"" "-- metacols \"${params.features_metadata_cols}\"" - ].join(' ').trim() , - ]} + ].join(' ').trim() } } withName: DESEQ2_DIFFERENTIAL { From 872023dde40629ae11d576152c65a9fb2d7e24bc Mon Sep 17 00:00:00 2001 From: Azedine Zoufir Date: Thu, 15 Jun 2023 16:09:13 +0100 Subject: [PATCH 24/72] fix typo in config --- conf/modules.config | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/conf/modules.config b/conf/modules.config index 920dc1be..f56a69e6 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -120,7 +120,7 @@ process { ] ext.prefix = { "normalised." } ext.args = {[ - "--querygse \"${params.querygse}\"" + "--querygse \"${params.querygse}\"", "-- metacols \"${params.features_metadata_cols}\"" ].join(' ').trim() } } From e7bfdc61aee783173561eb178e6d35e1de6264d1 Mon Sep 17 00:00:00 2001 From: Azedine Zoufir Date: Thu, 15 Jun 2023 16:21:56 +0100 Subject: [PATCH 25/72] fix parameter name in read_soft_matrix --- modules/local/templates/read_soft_matrix.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/templates/read_soft_matrix.R b/modules/local/templates/read_soft_matrix.R index 36b0621b..dbfd22f7 100644 --- a/modules/local/templates/read_soft_matrix.R +++ b/modules/local/templates/read_soft_matrix.R @@ -46,7 +46,7 @@ round_dataframe_columns <- function(df, columns = NULL, digits = 8){ opt <- list( querygse = '$querygse' - metacols = '$features_metadata_cols' + metacols = '$metacols' ) args_opt <- parse_args('$task.ext.args') for ( ao in names(args_opt)){ From 52929b32150489391537d462dce2151e28a5125c Mon Sep 17 00:00:00 2001 From: Azedine Zoufir Date: Thu, 15 Jun 2023 16:46:06 +0100 Subject: [PATCH 26/72] fix parameter name in read_soft_matrix --- modules/local/templates/read_soft_matrix.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/templates/read_soft_matrix.R b/modules/local/templates/read_soft_matrix.R index dbfd22f7..7d6849c1 100644 --- a/modules/local/templates/read_soft_matrix.R +++ b/modules/local/templates/read_soft_matrix.R @@ -45,7 +45,7 @@ round_dataframe_columns <- function(df, columns = NULL, digits = 8){ ############################ opt <- list( - querygse = '$querygse' + querygse = '$querygse', metacols = '$metacols' ) args_opt <- parse_args('$task.ext.args') From 94b9b26a7299171f6545a0fc90699a94fee75125 Mon Sep 17 00:00:00 
2001 From: Azedine Zoufir Date: Thu, 15 Jun 2023 17:01:17 +0100 Subject: [PATCH 27/72] change column names to GSM ids in configuration, to match SOFT format --- conf/test_soft_array.config | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/test_soft_array.config b/conf/test_soft_array.config index dcc9e083..a21e1d01 100644 --- a/conf/test_soft_array.config +++ b/conf/test_soft_array.config @@ -25,8 +25,8 @@ params { study_abundance_type = 'intensities' // Observations - observations_id_col = 'name' - observations_name_col = 'name' + observations_id_col = 'id' + observations_name_col = 'id' // Features From ed95ab4e16ed7844aebe9bd2c75c2c68b8f8a739 Mon Sep 17 00:00:00 2001 From: Azedine Zoufir Date: Thu, 15 Jun 2023 17:25:47 +0100 Subject: [PATCH 28/72] update docs --- CHANGELOG.md | 4 ++++ docs/usage.md | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 960f42e5..5235eb24 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## v1.4dev - 2023-05-15 + +- [] - Added support for non-Affymetrix arrays via automatic download of SOFT matrices in GEO + ## v1.3.0dev - [date] ### `Added` diff --git a/docs/usage.md b/docs/usage.md index a0101cac..53a82ba1 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -57,6 +57,10 @@ This is a numeric square matrix file, comma or tab-separated, with a column for This is an archive of CEL files as frequently found in GEO. +### Other microarray platforms + +Alternatively, user may want to work non-Affymetrix array platforms. 
In this case, setting `--study_type non_affy_array` and `--querygse [GSE study ID]` enables the pipeline to download normalised SOFT matrices automatically (note that even though Affymetrix arrays are also supported in the SOFT matrix track, it is recommended to work from CEL files in this case). + ## Contrasts file ```bash From 7252fcf7ec50cd9aa6dd7a630dbdacd3f67a1380 Mon Sep 17 00:00:00 2001 From: Azedine Zoufir Date: Fri, 16 Jun 2023 12:02:19 +0100 Subject: [PATCH 29/72] update docs --- docs/usage.md | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index 53a82ba1..9b3b4c85 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -57,9 +57,17 @@ This is a numeric square matrix file, comma or tab-separated, with a column for This is an archive of CEL files as frequently found in GEO. -### Other microarray platforms +### Other microarray platforms / SOFT matrices -Alternatively, user may want to work non-Affymetrix array platforms. In this case, setting `--study_type non_affy_array` and `--querygse [GSE study ID]` enables the pipeline to download normalised SOFT matrices automatically (note that even though Affymetrix arrays are also supported in the SOFT matrix track, it is recommended to work from CEL files in this case). +Alternatively, user may want to work non-Affymetrix array platforms. In this case, setting + +`--study_type non_affy_array` and `--querygse [GSE study ID]` + +enables the pipeline to download normalised SOFT matrices automatically (note that even though Affymetrix arrays are also supported in the SOFT matrix track, it is recommended to work from CEL files in this case). Importantly, user must provide a valid set of fields for the features metadata relevant for the platforms e.g. for GPL570 (Affymetrix Plus 2.0 arrays) this could be done with +``` +--features_metadata_cols ID,Entrez_Gene_ID,Symbol,Definition +``` +Full list of features metadata are available on GEO platform pages. 
## Contrasts file From c8bd574e4148064bd56726d6e3b746b00d2ca4a8 Mon Sep 17 00:00:00 2001 From: Azedine Zoufir Date: Fri, 16 Jun 2023 14:52:25 +0100 Subject: [PATCH 30/72] NA treatment when determining if log transformation needs to be applied --- modules/local/templates/read_soft_matrix.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/templates/read_soft_matrix.R b/modules/local/templates/read_soft_matrix.R index 7d6849c1..953bcb24 100644 --- a/modules/local/templates/read_soft_matrix.R +++ b/modules/local/templates/read_soft_matrix.R @@ -77,7 +77,7 @@ write.table(fData(eset)[,feature_cols], # if data is not log scale, transform it as needed for limma downstream -if(max(exprs(eset)) > 20) { # a bit dirty, needs proper solution later... +if(max(exprs(eset),na.rm=T) > 20) { # a bit dirty, needs proper solution later... exprs(eset)[exprs(eset) <= 0] <- .001 exprs(eset) <- log2(exprs(eset)) } From 5308d7c6118e178cd3356ae4ac94b4f075a70f4d Mon Sep 17 00:00:00 2001 From: Azedine Zoufir Date: Fri, 16 Jun 2023 15:12:31 +0100 Subject: [PATCH 31/72] NA treatment when determining if log transformation needs to be applied --- modules/local/templates/read_soft_matrix.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/templates/read_soft_matrix.R b/modules/local/templates/read_soft_matrix.R index 953bcb24..936fbafc 100644 --- a/modules/local/templates/read_soft_matrix.R +++ b/modules/local/templates/read_soft_matrix.R @@ -78,7 +78,7 @@ write.table(fData(eset)[,feature_cols], # if data is not log scale, transform it as needed for limma downstream if(max(exprs(eset),na.rm=T) > 20) { # a bit dirty, needs proper solution later... 
- exprs(eset)[exprs(eset) <= 0] <- .001 + exprs(eset)[exprs(eset) <= 0 | is.na(exprs(eset))] <- .001 exprs(eset) <- log2(exprs(eset)) } From 2dc1a410be01cb99a608a96fcdd73afdcf436928 Mon Sep 17 00:00:00 2001 From: Azedine Zoufir Date: Wed, 28 Jun 2023 13:34:24 +0100 Subject: [PATCH 32/72] fix typo in test prior to differential process --- workflows/differentialabundance.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf index c81fb586..f3adc625 100644 --- a/workflows/differentialabundance.nf +++ b/workflows/differentialabundance.nf @@ -270,7 +270,7 @@ workflow DIFFERENTIALABUNDANCE { .join(CUSTOM_MATRIXFILTER.out.filtered) // -> meta, samplesheet, filtered matrix .first() - if (params.study_type == 'affy_array' || 'non_affy_array'){ + if (params.study_type == 'affy_array' || params.study_type == 'non_affy_array'){ LIMMA_DIFFERENTIAL ( ch_contrasts, From cfe4bf003054041dd003fca92b75fdbc309444fd Mon Sep 17 00:00:00 2001 From: nf-core-bot Date: Wed, 28 Jun 2023 16:00:04 +0000 Subject: [PATCH 33/72] [automated] Fix linting with Prettier --- docs/usage.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/usage.md b/docs/usage.md index 9b3b4c85..cfcf4ce5 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -64,9 +64,11 @@ Alternatively, user may want to work non-Affymetrix array platforms. In this cas `--study_type non_affy_array` and `--querygse [GSE study ID]` enables the pipeline to download normalised SOFT matrices automatically (note that even though Affymetrix arrays are also supported in the SOFT matrix track, it is recommended to work from CEL files in this case). Importantly, user must provide a valid set of fields for the features metadata relevant for the platforms e.g. 
for GPL570 (Affymetrix Plus 2.0 arrays) this could be done with + ``` --features_metadata_cols ID,Entrez_Gene_ID,Symbol,Definition ``` + Full list of features metadata are available on GEO platform pages. ## Contrasts file From 738d5bbe54979b98ef5ec54d8dd17606f3427e1e Mon Sep 17 00:00:00 2001 From: Azedine Zoufir Date: Fri, 30 Jun 2023 09:49:44 +0100 Subject: [PATCH 34/72] put changes back into v1.3.0 --- CHANGELOG.md | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5235eb24..9a84186e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,10 +3,6 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## v1.4dev - 2023-05-15 - -- [] - Added support for non-Affymetrix arrays via automatic download of SOFT matrices in GEO - ## v1.3.0dev - [date] ### `Added` @@ -15,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [[#129](https://github.com/nf-core/differentialabundance/pull/129)] - Module updates to fit with recent registry changes ([@pinin4fjords](https://github.com/pinin4fjords), review by [@maxulysse](https://github.com/maxulysse), [@adamrtalbot](https://github.com/adamrtalbot)) - [[#130](https://github.com/nf-core/differentialabundance/pull/130)] - Document reasons for lack of differential expression ([@pinin4fjords](https://github.com/pinin4fjords), review by [@jfy133](https://github.com/jfy133)) - [[#131](https://github.com/nf-core/differentialabundance/pull/131)] - Improve gtf to table configurability ([@pinin4fjords](https://github.com/pinin4fjords), review by [@WackerO](https://github.com/WackerO)) +- [# 136] - Added support for non-Affymetrix arrays via automatic download of SOFT matrices in GEO ### `Fixed` From 544581747acc7ca6937c5e6e22e28a18947a8db2 Mon Sep 17 00:00:00 2001 From: Azedine Zoufir Date: Fri, 30 Jun 2023 09:56:39 +0100 Subject: [PATCH 
35/72] replace new study type [non_affy_array] by [geo_soft_file] --- conf/test_soft_array.config | 2 +- docs/usage.md | 2 +- nextflow_schema.json | 2 +- workflows/differentialabundance.nf | 14 +++++++------- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/conf/test_soft_array.config b/conf/test_soft_array.config index a21e1d01..ebeb6c2b 100644 --- a/conf/test_soft_array.config +++ b/conf/test_soft_array.config @@ -21,7 +21,7 @@ params { querygse = 'GSE50790' // Study - study_type = 'non_affy_array' + study_type = 'geo_soft_file' study_abundance_type = 'intensities' // Observations diff --git a/docs/usage.md b/docs/usage.md index cfcf4ce5..206e6b3a 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -61,7 +61,7 @@ This is an archive of CEL files as frequently found in GEO. Alternatively, user may want to work non-Affymetrix array platforms. In this case, setting -`--study_type non_affy_array` and `--querygse [GSE study ID]` +`--study_type geo_soft_file` and `--querygse [GSE study ID]` enables the pipeline to download normalised SOFT matrices automatically (note that even though Affymetrix arrays are also supported in the SOFT matrix track, it is recommended to work from CEL files in this case). Importantly, user must provide a valid set of fields for the features metadata relevant for the platforms e.g. 
for GPL570 (Affymetrix Plus 2.0 arrays) this could be done with diff --git a/nextflow_schema.json b/nextflow_schema.json index a01c87ff..2f19707e 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -24,7 +24,7 @@ "default": "rnaseq", "description": "A string identifying the technology used to produce the data", "help_text": "Currently 'rnaseq' or 'affy_array' may be specified.", - "enum": ["rnaseq", "affy_array", "non_affy_array"], + "enum": ["rnaseq", "affy_array", "geo_soft_file"], "fa_icon": "far fa-keyboard" }, "input": { diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf index f3adc625..c38bff04 100644 --- a/workflows/differentialabundance.nf +++ b/workflows/differentialabundance.nf @@ -24,7 +24,7 @@ if (params.study_type == 'affy_array'){ } // If this is another array platform and user wish to read from SOFT files // then a GSE study identifier must be provided -} else if (params.study_type == 'non_affy_array' && params.querygse != "" && params.features_metadata_cols != ""){ +} else if (params.study_type == 'geo_soft_file' && params.querygse != "" && params.features_metadata_cols != ""){ if (params.querygse) { ch_querygse = Channel.of([exp_meta, params.querygse, params.features_metadata_cols]) } else { @@ -147,7 +147,7 @@ workflow DIFFERENTIALABUNDANCE { ch_affy_platform_features = AFFY_JUSTRMA_RAW.out.annotation } - else if(params.study_type == 'non_affy_array'){ + else if(params.study_type == 'geo_soft_file'){ ch_generic_array_input = ch_input .join(ch_querygse) @@ -159,7 +159,7 @@ workflow DIFFERENTIALABUNDANCE { //// Fetch or derive a feature annotation table // If user has provided a feature annotation table, use that - if(params.study_type != 'non_affy_array') { + if(params.study_type != 'geo_soft_file') { if (params.features){ ch_features = Channel.of([ exp_meta, file(params.features, checkIfExists: true)]) } else if (params.study_type == 'affy_array'){ @@ -233,7 +233,7 @@ workflow DIFFERENTIALABUNDANCE { 
ch_norm = ch_validated_assays.normalised ch_matrix_for_differential = ch_norm } - else if (params.study_type == 'non_affy_array') { + else if (params.study_type == 'geo_soft_file') { ch_norm = VALIDATOR.out.assays ch_matrix_for_differential = ch_norm } @@ -270,7 +270,7 @@ workflow DIFFERENTIALABUNDANCE { .join(CUSTOM_MATRIXFILTER.out.filtered) // -> meta, samplesheet, filtered matrix .first() - if (params.study_type == 'affy_array' || params.study_type == 'non_affy_array'){ + if (params.study_type == 'affy_array' || params.study_type == 'geo_soft_file'){ LIMMA_DIFFERENTIAL ( ch_contrasts, @@ -383,7 +383,7 @@ workflow DIFFERENTIALABUNDANCE { } .unique() - if(params.study_type != "non_affy_array") { + if(params.study_type != "geo_soft_file") { ch_all_matrices = VALIDATOR.out.sample_meta // meta, samples .join(VALIDATOR.out.feature_meta) // meta, samples, features .join(ch_raw) // meta, samples, features, raw matrix @@ -493,7 +493,7 @@ workflow DIFFERENTIALABUNDANCE { // Condition params reported on study type def params_pattern = ~/^(report|study|observations|features|filtering|exploratory|differential|deseq2|gsea).*/ - if (params.study_type == 'affy_array' || 'non_affy_array'){ + if (params.study_type == 'affy_array' || 'geo_soft_file'){ params_pattern = ~/^(report|study|observations|features|filtering|exploratory|differential|affy|limma|gsea).*/ } From 434c8103cd4240f7d9f2c39820c9cbd5db22ad47 Mon Sep 17 00:00:00 2001 From: Azedine Zoufir Date: Fri, 30 Jun 2023 10:49:28 +0100 Subject: [PATCH 36/72] remove [querygse] and [features_metadata_cols] as input to read_soft modules and use as params instead --- modules/local/read_from_soft.nf | 2 +- modules/local/templates/read_soft_matrix.R | 4 ++-- workflows/differentialabundance.nf | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/modules/local/read_from_soft.nf b/modules/local/read_from_soft.nf index 4dbfcfef..ed5c2a9d 100644 --- a/modules/local/read_from_soft.nf +++ 
b/modules/local/read_from_soft.nf @@ -8,7 +8,7 @@ process READ_FROM_SOFT { 'quay.io/biocontainers/bioconductor-geoquery:2.66.0--r42hdfd78af_0' }" input: - tuple val(meta), path(samplesheet), val(querygse), val(metacols) + tuple val(meta), path(samplesheet) output: tuple val(meta), path("*.rds") , emit: rds diff --git a/modules/local/templates/read_soft_matrix.R b/modules/local/templates/read_soft_matrix.R index 936fbafc..f98180f7 100644 --- a/modules/local/templates/read_soft_matrix.R +++ b/modules/local/templates/read_soft_matrix.R @@ -45,8 +45,8 @@ round_dataframe_columns <- function(df, columns = NULL, digits = 8){ ############################ opt <- list( - querygse = '$querygse', - metacols = '$metacols' + querygse = '', + metacols = '' ) args_opt <- parse_args('$task.ext.args') for ( ao in names(args_opt)){ diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf index c38bff04..844fb9aa 100644 --- a/workflows/differentialabundance.nf +++ b/workflows/differentialabundance.nf @@ -24,9 +24,9 @@ if (params.study_type == 'affy_array'){ } // If this is another array platform and user wish to read from SOFT files // then a GSE study identifier must be provided -} else if (params.study_type == 'geo_soft_file' && params.querygse != "" && params.features_metadata_cols != ""){ - if (params.querygse) { - ch_querygse = Channel.of([exp_meta, params.querygse, params.features_metadata_cols]) +} else if (params.study_type == 'geo_soft_file'){ + if (params.querygse && params.features_metadata_cols) { + ch_querygse = Channel.of([exp_meta]) } else { error("Query GSE not specified or features metadata columns not specified") } From 718c0b31ec14cb6f78477d2f8643c3d6020bfd31 Mon Sep 17 00:00:00 2001 From: Azedine Zoufir Date: Fri, 30 Jun 2023 11:18:57 +0100 Subject: [PATCH 37/72] fix linting/indents in read_soft_matrix.R --- modules/local/templates/read_soft_matrix.R | 78 +++++++++++----------- 1 file changed, 39 insertions(+), 39 deletions(-) diff --git 
a/modules/local/templates/read_soft_matrix.R b/modules/local/templates/read_soft_matrix.R index f98180f7..10e7f354 100644 --- a/modules/local/templates/read_soft_matrix.R +++ b/modules/local/templates/read_soft_matrix.R @@ -8,14 +8,14 @@ # @param x Long-form argument list like --opt1 val1 --opt2 val2 # return named list of options and values similar to optparse parse_args <- function(x){ - args_list <- unlist(strsplit(x, ' ?--')[[1]])[-1] - args_vals <- lapply(args_list, function(x) scan(text=x, what='character', quiet = TRUE)) + args_list <- unlist(strsplit(x, ' ?--')[[1]])[-1] + args_vals <- lapply(args_list, function(x) scan(text=x, what='character', quiet = TRUE)) - # Ensure the option vectors are length 2 (key/ value) to catch empty ones - args_vals <- lapply(args_vals, function(z){ length(z) <- 2; z}) + # Ensure the option vectors are length 2 (key/ value) to catch empty ones + args_vals <- lapply(args_vals, function(z){ length(z) <- 2; z}) - parsed_args <- structure(lapply(args_vals, function(x) x[2]), names = lapply(args_vals, function(x) x[1])) - parsed_args[! is.na(parsed_args)] + parsed_args <- structure(lapply(args_vals, function(x) x[2]), names = lapply(args_vals, function(x) x[1])) + parsed_args[! is.na(parsed_args)] } # From affy/justRMA (pinin4fjords) # Round numeric dataframe columns to fixed decimal places by applying @@ -25,19 +25,19 @@ parse_args <- function(x){ # @param digits How many decimal places to round to? 
# @return output Data frame round_dataframe_columns <- function(df, columns = NULL, digits = 8){ - if (is.null(columns)){ - columns <- colnames(df) - } + if (is.null(columns)){ + columns <- colnames(df) + } - df[,columns] <- format(data.frame(df[, columns]), nsmall = digits) + df[,columns] <- format(data.frame(df[, columns]), nsmall = digits) - # Convert columns back to numeric + # Convert columns back to numeric - for (c in columns) { - df[[c]][grep("^ *NA\$", df[[c]])] <- NA - df[[c]] <- as.numeric(df[[c]]) - } - df + for (c in columns) { + df[[c]][grep("^ *NA\$", df[[c]])] <- NA + df[[c]] <- as.numeric(df[[c]]) + } + df } ############################ @@ -45,16 +45,16 @@ round_dataframe_columns <- function(df, columns = NULL, digits = 8){ ############################ opt <- list( - querygse = '', - metacols = '' + querygse = '', + metacols = '' ) args_opt <- parse_args('$task.ext.args') for ( ao in names(args_opt)){ - if (! ao %in% names(opt)){ - stop(paste("Invalid option:", ao)) - }else{ - opt[[ao]] <- args_opt[[ao]] - } + if (! ao %in% names(opt)){ + stop(paste("Invalid option:", ao)) + }else{ + opt[[ao]] <- args_opt[[ao]] + } } ############################ @@ -78,8 +78,8 @@ write.table(fData(eset)[,feature_cols], # if data is not log scale, transform it as needed for limma downstream if(max(exprs(eset),na.rm=T) > 20) { # a bit dirty, needs proper solution later... 
- exprs(eset)[exprs(eset) <= 0 | is.na(exprs(eset))] <- .001 - exprs(eset) <- log2(exprs(eset)) + exprs(eset)[exprs(eset) <= 0 | is.na(exprs(eset))] <- .001 + exprs(eset) <- log2(exprs(eset)) } output_prefix <- '$task.ext.prefix' @@ -87,14 +87,14 @@ saveRDS(eset, file = paste0(output_prefix, 'eset.rds')) # write intensity matrix (normalised) write.table( - data.frame( - probe_id = rownames(eset), - round_dataframe_columns(as.data.frame(exprs(eset))), - check.names = FALSE - ), - file = paste0(output_prefix, 'matrix.tsv'), - col.names = TRUE, row.names = FALSE, - sep = '\t', quote = FALSE + data.frame( + probe_id = rownames(eset), + round_dataframe_columns(as.data.frame(exprs(eset))), + check.names = FALSE + ), + file = paste0(output_prefix, 'matrix.tsv'), + col.names = TRUE, row.names = FALSE, + sep = '\t', quote = FALSE ) @@ -111,9 +111,9 @@ r.version <- strsplit(version[['version.string']], ' ')[[1]][3] geoquery.version <- as.character(packageVersion("GEOquery")) writeLines( - c( - '"${task.process}":', - paste(' r-base:', r.version), - paste(' bioconductor-:', geoquery.version) - ), - 'versions.yml') + c( + '"${task.process}":', + paste(' r-base:', r.version), + paste(' bioconductor-:', geoquery.version) + ), + 'versions.yml') From 51b57897e68768c7678b16df05144ec6965ca8b0 Mon Sep 17 00:00:00 2001 From: Azedine Zoufir Date: Fri, 30 Jun 2023 11:23:56 +0100 Subject: [PATCH 38/72] remove usage of 'non affymetrix' in docs and stress that's is a different use case --- docs/usage.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index 206e6b3a..61d3db59 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -57,9 +57,9 @@ This is a numeric square matrix file, comma or tab-separated, with a column for This is an archive of CEL files as frequently found in GEO. -### Other microarray platforms / SOFT matrices +### Use SOFT matrices -Alternatively, user may want to work non-Affymetrix array platforms. 
In this case, setting +Alternatively, the user may want to work with SOFT matrices. In this case, setting `--study_type geo_soft_file` and `--querygse [GSE study ID]` From bf54aa8617784a9ecbbc398bd8f87951dbdf5337 Mon Sep 17 00:00:00 2001 From: Azedine Zoufir Date: Fri, 30 Jun 2023 11:29:05 +0100 Subject: [PATCH 39/72] name input channel for soft rather than generic --- workflows/differentialabundance.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf index ff44041d..9f93a5d2 100644 --- a/workflows/differentialabundance.nf +++ b/workflows/differentialabundance.nf @@ -149,10 +149,10 @@ workflow DIFFERENTIALABUNDANCE { } else if(params.study_type == 'geo_soft_file'){ - ch_generic_array_input = ch_input + ch_soft_file_input = ch_input .join(ch_querygse) - READ_FROM_SOFT(ch_generic_array_input) + READ_FROM_SOFT(ch_soft_file_input) ch_in_raw = READ_FROM_SOFT.out.expression ch_features = READ_FROM_SOFT.out.annotation } From 38e7020f3be45ec0ee06d7f653b33ccdf3343281 Mon Sep 17 00:00:00 2001 From: Azedine Zoufir <41290849+azedine-healx@users.noreply.github.com> Date: Fri, 30 Jun 2023 11:31:39 +0100 Subject: [PATCH 40/72] handle features from soft separately from user provided features Co-authored-by: Jonathan Manning --- workflows/differentialabundance.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf index 9f93a5d2..e63d3881 100644 --- a/workflows/differentialabundance.nf +++ b/workflows/differentialabundance.nf @@ -154,7 +154,7 @@ workflow DIFFERENTIALABUNDANCE { READ_FROM_SOFT(ch_soft_file_input) ch_in_raw = READ_FROM_SOFT.out.expression - ch_features = READ_FROM_SOFT.out.annotation + ch_soft_features = READ_FROM_SOFT.out.annotation } //// Fetch or derive a feature annotation table From 0f41d6f22f433ed776aeb57ca162ed08d1bc228a Mon Sep 17 00:00:00 2001 From: Azedine Zoufir 
<41290849+azedine-healx@users.noreply.github.com> Date: Fri, 30 Jun 2023 11:32:01 +0100 Subject: [PATCH 41/72] handle features from soft separately from user provided features Co-authored-by: Jonathan Manning --- workflows/differentialabundance.nf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf index e63d3881..996842ce 100644 --- a/workflows/differentialabundance.nf +++ b/workflows/differentialabundance.nf @@ -164,6 +164,8 @@ workflow DIFFERENTIALABUNDANCE { ch_features = Channel.of([ exp_meta, file(params.features, checkIfExists: true)]) } else if (params.study_type == 'affy_array'){ ch_features = ch_affy_platform_features + } else if(params.study_type == 'geo_soft_file') { + ch_features = ch_soft_features } else if (params.gtf){ // Get feature annotations from a GTF file, gunzip if necessary From 0916ee13703a389560b255de5bd383c99d69df2b Mon Sep 17 00:00:00 2001 From: Azedine Zoufir Date: Fri, 30 Jun 2023 11:37:03 +0100 Subject: [PATCH 42/72] handle features from soft separately from user provided features --- workflows/differentialabundance.nf | 69 +++++++++++++++--------------- 1 file changed, 34 insertions(+), 35 deletions(-) diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf index 996842ce..7a4964ec 100644 --- a/workflows/differentialabundance.nf +++ b/workflows/differentialabundance.nf @@ -159,47 +159,46 @@ workflow DIFFERENTIALABUNDANCE { //// Fetch or derive a feature annotation table // If user has provided a feature annotation table, use that - if(params.study_type != 'geo_soft_file') { - if (params.features){ - ch_features = Channel.of([ exp_meta, file(params.features, checkIfExists: true)]) - } else if (params.study_type == 'affy_array'){ - ch_features = ch_affy_platform_features - } else if(params.study_type == 'geo_soft_file') { - ch_features = ch_soft_features - } else if (params.gtf){ - // Get feature annotations from a GTF file, gunzip if 
necessary - - file_gtf_in = file(params.gtf) - file_gtf = [ [ "id": file_gtf_in.simpleName ], file_gtf_in ] - - if ( params.gtf.endsWith('.gz') ){ - GUNZIP_GTF(file_gtf) - file_gtf = GUNZIP_GTF.out.gunzip - ch_versions = ch_versions.mix(GUNZIP_GTF.out.versions) - } + if (params.features){ + ch_features = Channel.of([ exp_meta, file(params.features, checkIfExists: true)]) + } else if (params.study_type == 'affy_array'){ + ch_features = ch_affy_platform_features + } else if(params.study_type == 'geo_soft_file') { + ch_features = ch_soft_features + } else if (params.gtf){ + // Get feature annotations from a GTF file, gunzip if necessary + + file_gtf_in = file(params.gtf) + file_gtf = [ [ "id": file_gtf_in.simpleName ], file_gtf_in ] + + if ( params.gtf.endsWith('.gz') ){ + GUNZIP_GTF(file_gtf) + file_gtf = GUNZIP_GTF.out.gunzip + ch_versions = ch_versions.mix(GUNZIP_GTF.out.versions) + } - // Get a features table from the GTF and combine with the matrix and sample - // annotation (fom = features/ observations/ matrix) + // Get a features table from the GTF and combine with the matrix and sample + // annotation (fom = features/ observations/ matrix) - GTF_TO_TABLE( file_gtf, [[ "id":""], []]) - ch_features = GTF_TO_TABLE.out.feature_annotation - .map{ - tuple( exp_meta, it[1]) - } + GTF_TO_TABLE( file_gtf, [[ "id":""], []]) + ch_features = GTF_TO_TABLE.out.feature_annotation + .map{ + tuple( exp_meta, it[1]) + } - // Record the version of the GTF -> table tool + // Record the version of the GTF -> table tool - ch_versions = ch_versions - .mix(GTF_TO_TABLE.out.versions) - } - else{ + ch_versions = ch_versions + .mix(GTF_TO_TABLE.out.versions) + } + else{ - // Otherwise we can just use the matrix input - matrix_as_anno_filename = "matrix_as_anno.${matrix_file.getExtension()}" - matrix_file.copyTo(matrix_as_anno_filename) - ch_features = Channel.of([ exp_meta, file(matrix_as_anno_filename)]) - } + // Otherwise we can just use the matrix input + matrix_as_anno_filename = 
"matrix_as_anno.${matrix_file.getExtension()}" + matrix_file.copyTo(matrix_as_anno_filename) + ch_features = Channel.of([ exp_meta, file(matrix_as_anno_filename)]) } + // Channel for the contrasts file ch_contrasts_file = Channel.from([[exp_meta, file(params.contrasts)]]) From b0c46ab56b446a933efca6a0a7f80e753a7b5027 Mon Sep 17 00:00:00 2001 From: Azedine Zoufir Date: Fri, 30 Jun 2023 11:40:39 +0100 Subject: [PATCH 43/72] add mix versions for AFFY_JUSTRMA_RAW, AFFY_JUSTRMA_NORM and READ_FROM_SOFT --- workflows/differentialabundance.nf | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf index 7a4964ec..284a81cf 100644 --- a/workflows/differentialabundance.nf +++ b/workflows/differentialabundance.nf @@ -146,6 +146,13 @@ workflow DIFFERENTIALABUNDANCE { ch_in_norm = AFFY_JUSTRMA_NORM.out.expression ch_affy_platform_features = AFFY_JUSTRMA_RAW.out.annotation + + ch_versions = ch_versions + .mix(AFFY_JUSTRMA_RAW.out.versions) + + ch_versions = ch_versions + .mix(AFFY_JUSTRMA_NORM.out.versions) + } else if(params.study_type == 'geo_soft_file'){ @@ -155,6 +162,9 @@ workflow DIFFERENTIALABUNDANCE { READ_FROM_SOFT(ch_soft_file_input) ch_in_raw = READ_FROM_SOFT.out.expression ch_soft_features = READ_FROM_SOFT.out.annotation + + ch_versions = ch_versions + .mix(READ_FROM_SOFT.out.versions) } //// Fetch or derive a feature annotation table From 2c0158c441efc005b76b7ccaa0b62eee948b01e9 Mon Sep 17 00:00:00 2001 From: Azedine Zoufir Date: Fri, 30 Jun 2023 12:05:49 +0100 Subject: [PATCH 44/72] make query GSE about downloading SOFT matrices and not about non-affy --- nextflow_schema.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 2f19707e..af30e6bf 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -85,7 +85,7 @@ "querygse": { "type": "string", "default": "None", - "description": "Alternative to CEL archive: the GSE 
identifier as found in GEO", + "description": "Use SOFT files from GEO by providing the GSE study identifier", "fa_icon": "fas fa-keyboard", "help_text": "Use this option to provide a GSE study identifier." } From 0aee2fad4fee2bde3113c45d7d4948ae15f42458 Mon Sep 17 00:00:00 2001 From: Azedine Zoufir Date: Fri, 30 Jun 2023 12:08:51 +0100 Subject: [PATCH 45/72] change logic to set ch_matrix_for_differential = ch_norm for all non-rnaseq --- workflows/differentialabundance.nf | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf index 284a81cf..9d55019b 100644 --- a/workflows/differentialabundance.nf +++ b/workflows/differentialabundance.nf @@ -242,10 +242,12 @@ workflow DIFFERENTIALABUNDANCE { } ch_raw = ch_validated_assays.raw ch_norm = ch_validated_assays.normalised - ch_matrix_for_differential = ch_norm } else if (params.study_type == 'geo_soft_file') { ch_norm = VALIDATOR.out.assays + } + + if(params.study_type != 'rnaseq') { ch_matrix_for_differential = ch_norm } else{ From 45ec150be02f1ef66cecdf99d26d99cc413c1117 Mon Sep 17 00:00:00 2001 From: Azedine Zoufir Date: Fri, 30 Jun 2023 12:31:45 +0100 Subject: [PATCH 46/72] simplify processing input to reporting modules --- workflows/differentialabundance.nf | 53 +++++++++++++++++------------- 1 file changed, 31 insertions(+), 22 deletions(-) diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf index 9d55019b..f91d2c80 100644 --- a/workflows/differentialabundance.nf +++ b/workflows/differentialabundance.nf @@ -397,30 +397,39 @@ workflow DIFFERENTIALABUNDANCE { .unique() if(params.study_type != "geo_soft_file") { - ch_all_matrices = VALIDATOR.out.sample_meta // meta, samples - .join(VALIDATOR.out.feature_meta) // meta, samples, features - .join(ch_raw) // meta, samples, features, raw matrix - .combine(ch_processed_matrices) // meta, samples, features, raw, norm, ... 
- .map{ - tuple(it[0], it[1], it[2], it[3..it.size()-1]) - } - .first() - } - else { - ch_all_matrices = VALIDATOR.out.sample_meta // meta, samples - .join(VALIDATOR.out.feature_meta) // meta, samples, features - .combine(ch_processed_matrices) // meta, samples, features, norm, ... - .map{ - tuple(it[0], it[1], it[2], it[3..it.size()-1]) - } - .first() + ch_mat = ch_raw.combine(ch_processed_matrices) + }else{ + ch_mat = ch_processed_matrices } -// ch_contrast_variables -// .combine(ch_all_matrices.map{ it.tail() }) -// -// ch_contrast_variables -// .combine(ch_all_matrices.map{ it.tail() }) + ch_all_matrices = VALIDATOR.out.sample_meta // meta, samples + .join(VALIDATOR.out.feature_meta) // meta, samples, features + .join(ch_mat) // meta, samples, features, raw matrix + .combine(ch_processed_matrices) // meta, samples, features, raw, norm, ... + .map{ + tuple(it[0], it[1], it[2], it[3..it.size()-1]) + } + .first() + +// if(params.study_type != "geo_soft_file") { +// ch_all_matrices = VALIDATOR.out.sample_meta // meta, samples +// .join(VALIDATOR.out.feature_meta) // meta, samples, features +// .join(ch_raw) // meta, samples, features, raw matrix +// .combine(ch_processed_matrices) // meta, samples, features, raw, norm, ... +// .map{ +// tuple(it[0], it[1], it[2], it[3..it.size()-1]) +// } +// .first() +// } +// else { +// ch_all_matrices = VALIDATOR.out.sample_meta // meta, samples +// .join(VALIDATOR.out.feature_meta) // meta, samples, features +// .combine(ch_processed_matrices) // meta, samples, features, norm, ... 
+// .map{ +// tuple(it[0], it[1], it[2], it[3..it.size()-1]) +// } +// .first() +// } PLOT_EXPLORATORY( ch_contrast_variables From 7983d4e71cfecc2f5539afd7b01136f36199188a Mon Sep 17 00:00:00 2001 From: Azedine Zoufir Date: Fri, 30 Jun 2023 12:37:31 +0100 Subject: [PATCH 47/72] fix typo (was combining twice) --- workflows/differentialabundance.nf | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf index f91d2c80..6c296fc9 100644 --- a/workflows/differentialabundance.nf +++ b/workflows/differentialabundance.nf @@ -404,8 +404,7 @@ workflow DIFFERENTIALABUNDANCE { ch_all_matrices = VALIDATOR.out.sample_meta // meta, samples .join(VALIDATOR.out.feature_meta) // meta, samples, features - .join(ch_mat) // meta, samples, features, raw matrix - .combine(ch_processed_matrices) // meta, samples, features, raw, norm, ... + .join(ch_mat) // meta, samples, features, raw, norm (or just norm) .map{ tuple(it[0], it[1], it[2], it[3..it.size()-1]) } From db3d39d263bba28718926d39b0144d39fa9cb423 Mon Sep 17 00:00:00 2001 From: Azedine Zoufir Date: Fri, 30 Jun 2023 12:43:11 +0100 Subject: [PATCH 48/72] clean up commented code blocks --- workflows/differentialabundance.nf | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf index 6c296fc9..e3d46711 100644 --- a/workflows/differentialabundance.nf +++ b/workflows/differentialabundance.nf @@ -410,26 +410,6 @@ workflow DIFFERENTIALABUNDANCE { } .first() -// if(params.study_type != "geo_soft_file") { -// ch_all_matrices = VALIDATOR.out.sample_meta // meta, samples -// .join(VALIDATOR.out.feature_meta) // meta, samples, features -// .join(ch_raw) // meta, samples, features, raw matrix -// .combine(ch_processed_matrices) // meta, samples, features, raw, norm, ... 
-// .map{ -// tuple(it[0], it[1], it[2], it[3..it.size()-1]) -// } -// .first() -// } -// else { -// ch_all_matrices = VALIDATOR.out.sample_meta // meta, samples -// .join(VALIDATOR.out.feature_meta) // meta, samples, features -// .combine(ch_processed_matrices) // meta, samples, features, norm, ... -// .map{ -// tuple(it[0], it[1], it[2], it[3..it.size()-1]) -// } -// .first() -// } - PLOT_EXPLORATORY( ch_contrast_variables .combine(ch_all_matrices.map{ it.tail() }) From d16b9b70d36c5997c9252034fd708ee26e2d86d4 Mon Sep 17 00:00:00 2001 From: Azedine Zoufir Date: Wed, 26 Jul 2023 13:06:32 +0100 Subject: [PATCH 49/72] swap geoquery from local to nf-core module in main workflow --- workflows/differentialabundance.nf | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf index 3f976aca..d82f4ada 100644 --- a/workflows/differentialabundance.nf +++ b/workflows/differentialabundance.nf @@ -78,8 +78,6 @@ citations_file = file(params.citations_file, checkIfExists: true) */ include { TABULAR_TO_GSEA_CHIP } from '../modules/local/tabular_to_gsea_chip' -include { READ_FROM_SOFT } from '../modules/local/read_from_soft' - /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -108,7 +106,7 @@ include { CUSTOM_TABULARTOGSEACLS } from '../modules/n include { RMARKDOWNNOTEBOOK } from '../modules/nf-core/rmarkdownnotebook/main' include { AFFY_JUSTRMA as AFFY_JUSTRMA_RAW } from '../modules/nf-core/affy/justrma/main' include { AFFY_JUSTRMA as AFFY_JUSTRMA_NORM } from '../modules/nf-core/affy/justrma/main' - +include { GEOQUERY_GETGEO as READ_FROM_SOFT } from '../modules/nf-core/geoquery/getgeo/main' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ RUN MAIN WORKFLOW From 6b5a9e714c4b2388343764ed6a694b2136f27df1 Mon Sep 17 00:00:00 2001 From: Azedine Zoufir Date: Wed, 26 Jul 2023 13:07:37 +0100 Subject: [PATCH 50/72] 
install geoquery/getgeo as nf-core module --- modules/nf-core/geoquery/getgeo/main.nf | 24 +++ modules/nf-core/geoquery/getgeo/meta.yml | 47 +++++ .../geoquery/getgeo/templates/getgeo.R | 171 ++++++++++++++++++ 3 files changed, 242 insertions(+) create mode 100644 modules/nf-core/geoquery/getgeo/main.nf create mode 100644 modules/nf-core/geoquery/getgeo/meta.yml create mode 100644 modules/nf-core/geoquery/getgeo/templates/getgeo.R diff --git a/modules/nf-core/geoquery/getgeo/main.nf b/modules/nf-core/geoquery/getgeo/main.nf new file mode 100644 index 00000000..39d12c26 --- /dev/null +++ b/modules/nf-core/geoquery/getgeo/main.nf @@ -0,0 +1,24 @@ +process GEOQUERY_GETGEO { + tag "$meta.id" + label 'process_single' + + conda "bioconda::bioconductor-geoquery=2.66.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bioconductor-geoquery:2.66.0--r42hdfd78af_0' : + 'biocontainers/bioconductor-geoquery:2.66.0--r42hdfd78af_0' }" + + input: + tuple val(meta), val(querygse) + + output: + tuple val(meta), path("*.rds") , emit: rds + tuple val(meta), path("*matrix.tsv") , emit: expression + tuple val(meta), path("*annotation.tsv") , emit: annotation + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + template 'getgeo.R' +} diff --git a/modules/nf-core/geoquery/getgeo/meta.yml b/modules/nf-core/geoquery/getgeo/meta.yml new file mode 100644 index 00000000..bbacbe04 --- /dev/null +++ b/modules/nf-core/geoquery/getgeo/meta.yml @@ -0,0 +1,47 @@ +name: "geoquery_getgeo" +description: Retrieves GEO data from the Gene Expression Omnibus (GEO) +keywords: + - geo + - expression + - microarray + +tools: + - "geoquery": + description: "Get data from NCBI Gene Expression Omnibus (GEO)" + homepage: "https://bioconductor.org/packages/release/bioc/html/GEOquery.html" + documentation: 
"https://bioconductor.org/packages/release/bioc/vignettes/GEOquery/inst/doc/GEOquery.html" + tool_dev_url: "https://github.com/seandavi/GEOquery" + doi: "10.1093/bioinformatics/btm254" + licence: "MIT" + +input: + - meta: + type: map + description: | + Groovy Map containing metadata about the GEO dataset, minimally 'id'. + - querygse: + type: string + description: | + GSE identifier to pass to getGEO() + +output: + - rds: + type: file + description: R object containing GEO data + pattern: "*.rds" + - expression: + type: file + description: TSV-format expression matrix + pattern: "*matrix.tsv" + - annotation: + type: file + description: TSV-format annotation file + pattern: "*annotation.tsv" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@azedinez" + - "@pinin4fjords" diff --git a/modules/nf-core/geoquery/getgeo/templates/getgeo.R b/modules/nf-core/geoquery/getgeo/templates/getgeo.R new file mode 100644 index 00000000..99d73e40 --- /dev/null +++ b/modules/nf-core/geoquery/getgeo/templates/getgeo.R @@ -0,0 +1,171 @@ +#!/usr/bin/env Rscript + +################################################ +################################################ +## Functions ## +################################################ +################################################ + +#' Parse out options from a string without recourse to optparse +#' +#' @param x Long-form argument list like --opt1 val1 --opt2 val2 +#' +#' @return named list of options and values similar to optparse + +parse_args <- function(x){ + args_list <- unlist(strsplit(x, ' ?--')[[1]])[-1] + args_vals <- lapply(args_list, function(x) scan(text=x, what='character', quiet = TRUE)) + + # Ensure the option vectors are length 2 (key/ value) to catch empty ones + args_vals <- lapply(args_vals, function(z){ length(z) <- 2; z}) + + parsed_args <- structure(lapply(args_vals, function(x) x[2]), names = lapply(args_vals, function(x) x[1])) + parsed_args[! 
is.na(parsed_args)] +} + +#' Round numeric dataframe columns to fixed decimal places by applying +#' formatting and converting back to numerics +#' +#' @param dataframe A data frame +#' @param columns Which columns to round (assumes all of them by default) +#' @param digits How many decimal places to round to? +#' +#' @return output Data frame + +round_dataframe_columns <- function(df, columns = NULL, digits = 8){ + if (is.null(columns)){ + columns <- colnames(df) + } + + df[,columns] <- format( + data.frame(df[, columns], check.names = FALSE), + nsmall = digits + ) + + # Convert columns back to numeric + + for (c in columns) { + df[[c]][grep("^ *NA\$", df[[c]])] <- NA + df[[c]] <- as.numeric(df[[c]]) + } + df +} + +################################################ +################################################ +## PARSE PARAMETERS FROM NEXTFLOW ## +################################################ +################################################ + +opt <- list( + querygse = '$querygse', + metacols = NULL +) +args_opt <- parse_args('$task.ext.args') +for ( ao in names(args_opt)){ + if (! ao %in% names(opt)){ + stop(paste("Invalid option:", ao)) + }else{ + opt[[ao]] <- args_opt[[ao]] + } +} + +################################################ +################################################ +## Finish loading libraries ## +################################################ +################################################ + +library(GEOquery) + +################################################ +################################################ +## Do the GEO query retrieval ## +################################################ +################################################ + +# Fetch data for GSE number + +eset <- getGEO( + GEO = opt\$querygse, + destdir = getwd() +)[[1]] + +# Write probeset annotation. If supplied, Parse metadata columns from nextflow +# parameters to subset on the feature metadata file + +probeset_annotation = fData(eset) +if (! 
is.null(opt\$metacols)){ + feature_cols = strsplit(opt\$metacols,',')[[1]] + probeset_annotation <- probeset_annotation[,feature_cols] +} + +################################################ +################################################ +## Generate outputs ## +################################################ +################################################ + +output_prefix <- ifelse('$task.ext.prefix' == 'null', '', '$task.ext.prefix') + +write.table( + probeset_annotation, + paste0(output_prefix,'annotation.tsv'), + col.names=TRUE, + row.names=FALSE, + sep="\t", + quote=FALSE +) + +# If data is not log scale, transform it as needed for limma downstream + +if(max(exprs(eset),na.rm=T) > 20) { # a bit dirty, needs proper solution later... + exprs(eset) <- log2(exprs(eset) + 1) +} + +saveRDS(eset, file = paste0(output_prefix, 'eset.rds')) + +# Write intensity matrix (normalised) + +write.table( + data.frame( + probe_id = rownames(eset), + round_dataframe_columns(as.data.frame(exprs(eset))), + check.names = FALSE + ), + file = paste0(output_prefix, 'matrix.tsv'), + col.names = TRUE, row.names = FALSE, + sep = '\t', quote = FALSE +) + +################################################ +################################################ +## R SESSION INFO ## +################################################ +################################################ + +sink(paste(output_prefix, "R_sessionInfo.log", sep = '.')) +print(sessionInfo()) +sink() + +################################################ +################################################ +## VERSIONS FILE ## +################################################ +################################################ + +r.version <- strsplit(version[['version.string']], ' ')[[1]][3] +geoquery.version <- as.character(packageVersion("GEOquery")) + +writeLines( + c( + '"${task.process}":', + paste(' r-base:', r.version), + paste(' bioconductor-geoquery:', geoquery.version) + ), + 'versions.yml') + 
+################################################ +################################################ +################################################ +################################################ From 3e579e3d0bb934dd38ad4c8d04a74147957ee1ab Mon Sep 17 00:00:00 2001 From: Azedine Zoufir Date: Wed, 26 Jul 2023 13:08:22 +0100 Subject: [PATCH 51/72] add geoquery in modules.json --- modules.json | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/modules.json b/modules.json index ba265fbf..fac87741 100644 --- a/modules.json +++ b/modules.json @@ -40,6 +40,11 @@ "git_sha": "d0b4fc03af52a1cc8c6fb4493b921b57352b1dd8", "installed_by": ["modules"] }, + "geoquery/getgeo": { + "branch": "master", + "git_sha": "6814b0659c51e447684a58c2b834a9f3b530540d", + "installed_by": ["modules"] + }, "gsea/gsea": { "branch": "master", "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", From 3fa023e76761a9896c3e97abaff25a82e7303177 Mon Sep 17 00:00:00 2001 From: Azedine Zoufir Date: Fri, 25 Aug 2023 15:51:49 +0100 Subject: [PATCH 52/72] remove usage of alias for GEOQUERY_GETGEO --- workflows/differentialabundance.nf | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf index d82f4ada..ae42046d 100644 --- a/workflows/differentialabundance.nf +++ b/workflows/differentialabundance.nf @@ -106,7 +106,7 @@ include { CUSTOM_TABULARTOGSEACLS } from '../modules/n include { RMARKDOWNNOTEBOOK } from '../modules/nf-core/rmarkdownnotebook/main' include { AFFY_JUSTRMA as AFFY_JUSTRMA_RAW } from '../modules/nf-core/affy/justrma/main' include { AFFY_JUSTRMA as AFFY_JUSTRMA_NORM } from '../modules/nf-core/affy/justrma/main' -include { GEOQUERY_GETGEO as READ_FROM_SOFT } from '../modules/nf-core/geoquery/getgeo/main' +include { GEOQUERY_GETGEO } from '../modules/nf-core/geoquery/getgeo/main' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ RUN MAIN WORKFLOW @@ 
-164,12 +164,12 @@ workflow DIFFERENTIALABUNDANCE { ch_soft_file_input = ch_input .join(ch_querygse) - READ_FROM_SOFT(ch_soft_file_input) - ch_in_raw = READ_FROM_SOFT.out.expression - ch_soft_features = READ_FROM_SOFT.out.annotation + GEOQUERY_GETGEO(ch_soft_file_input) + ch_in_raw = GEOQUERY_GETGEO.out.expression + ch_soft_features = GEOQUERY_GETGEO.out.annotation ch_versions = ch_versions - .mix(READ_FROM_SOFT.out.versions) + .mix(GEOQUERY_GETGEO.out.versions) } //// Fetch or derive a feature annotation table From 6da3caca1bc4bad220d7f34c6a22d8a641125ed5 Mon Sep 17 00:00:00 2001 From: Azedine Zoufir Date: Fri, 25 Aug 2023 15:53:48 +0100 Subject: [PATCH 53/72] remove read_from_soft files from local folder --- modules/local/read_from_soft.nf | 24 ----- modules/local/templates/read_soft_matrix.R | 119 --------------------- 2 files changed, 143 deletions(-) delete mode 100644 modules/local/read_from_soft.nf delete mode 100644 modules/local/templates/read_soft_matrix.R diff --git a/modules/local/read_from_soft.nf b/modules/local/read_from_soft.nf deleted file mode 100644 index ed5c2a9d..00000000 --- a/modules/local/read_from_soft.nf +++ /dev/null @@ -1,24 +0,0 @@ -process READ_FROM_SOFT { - tag "$meta.id" - label 'process_single' - - conda "bioconda::bioconductor-geoquery=2.66.0" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/bioconductor-geoquery:2.66.0--r42hdfd78af_0' : - 'quay.io/biocontainers/bioconductor-geoquery:2.66.0--r42hdfd78af_0' }" - - input: - tuple val(meta), path(samplesheet) - - output: - tuple val(meta), path("*.rds") , emit: rds - tuple val(meta), path("*matrix.tsv") , emit: expression - tuple val(meta), path("*.annotation.tsv") , emit: annotation - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - template 'read_soft_matrix.R' -} diff --git a/modules/local/templates/read_soft_matrix.R b/modules/local/templates/read_soft_matrix.R deleted file mode 100644 index 10e7f354..00000000 --- a/modules/local/templates/read_soft_matrix.R +++ /dev/null @@ -1,119 +0,0 @@ -#!/usr/bin/env Rscript - -############################ -# FUNCTIONS -############################ -# From affy/justRMA (pinin4fjords) -# Parse out options from a string without recourse to optparse -# @param x Long-form argument list like --opt1 val1 --opt2 val2 -# return named list of options and values similar to optparse -parse_args <- function(x){ - args_list <- unlist(strsplit(x, ' ?--')[[1]])[-1] - args_vals <- lapply(args_list, function(x) scan(text=x, what='character', quiet = TRUE)) - - # Ensure the option vectors are length 2 (key/ value) to catch empty ones - args_vals <- lapply(args_vals, function(z){ length(z) <- 2; z}) - - parsed_args <- structure(lapply(args_vals, function(x) x[2]), names = lapply(args_vals, function(x) x[1])) - parsed_args[! is.na(parsed_args)] -} -# From affy/justRMA (pinin4fjords) -# Round numeric dataframe columns to fixed decimal places by applying -# formatting and converting back to numerics -# @param dataframe A data frame -# @param columns Which columns to round (assumes all of them by default) -# @param digits How many decimal places to round to? 
-# @return output Data frame -round_dataframe_columns <- function(df, columns = NULL, digits = 8){ - if (is.null(columns)){ - columns <- colnames(df) - } - - df[,columns] <- format(data.frame(df[, columns]), nsmall = digits) - - # Convert columns back to numeric - - for (c in columns) { - df[[c]][grep("^ *NA\$", df[[c]])] <- NA - df[[c]] <- as.numeric(df[[c]]) - } - df -} - -############################ -# PARSE PARAMS FROM NEXTFLOW -############################ - -opt <- list( - querygse = '', - metacols = '' -) -args_opt <- parse_args('$task.ext.args') -for ( ao in names(args_opt)){ - if (! ao %in% names(opt)){ - stop(paste("Invalid option:", ao)) - }else{ - opt[[ao]] <- args_opt[[ao]] - } -} - -############################ -# MAIN -############################ - -library(GEOquery) - -# fetch data for GSE number -eset <- getGEO(opt\$querygse)[[1]] - -# parse metadata columns from nextflow parameters -# to subset on the feature metadata file -feature_cols = strsplit(opt\$metacols,',')[[1]] - -# write probeset annotation -write.table(fData(eset)[,feature_cols], - paste0(opt\$querygse,'.annotation.tsv'), - col.names=TRUE, row.names=FALSE, sep="\t", quote=FALSE) - - -# if data is not log scale, transform it as needed for limma downstream -if(max(exprs(eset),na.rm=T) > 20) { # a bit dirty, needs proper solution later... 
- exprs(eset)[exprs(eset) <= 0 | is.na(exprs(eset))] <- .001 - exprs(eset) <- log2(exprs(eset)) -} - -output_prefix <- '$task.ext.prefix' -saveRDS(eset, file = paste0(output_prefix, 'eset.rds')) - -# write intensity matrix (normalised) -write.table( - data.frame( - probe_id = rownames(eset), - round_dataframe_columns(as.data.frame(exprs(eset))), - check.names = FALSE - ), - file = paste0(output_prefix, 'matrix.tsv'), - col.names = TRUE, row.names = FALSE, - sep = '\t', quote = FALSE -) - - -############################ -# LOG SESSION AND VERSIONS -############################ - - -sink("R_sessionInfo.log") -print(sessionInfo()) -sink() - -r.version <- strsplit(version[['version.string']], ' ')[[1]][3] -geoquery.version <- as.character(packageVersion("GEOquery")) - -writeLines( - c( - '"${task.process}":', - paste(' r-base:', r.version), - paste(' bioconductor-:', geoquery.version) - ), - 'versions.yml') From db1ada5b6ff84089f9bea8793b75f1cf95ce531d Mon Sep 17 00:00:00 2001 From: Azedine Zoufir <41290849+azedinez@users.noreply.github.com> Date: Fri, 25 Aug 2023 15:55:15 +0100 Subject: [PATCH 54/72] fix spacing in conf/modules.config Co-authored-by: Jonathan Manning --- conf/modules.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index 1916c7d1..4c07da9e 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -121,7 +121,7 @@ process { ext.prefix = { "normalised." 
} ext.args = {[ "--querygse \"${params.querygse}\"", - "-- metacols \"${params.features_metadata_cols}\"" + "--metacols \"${params.features_metadata_cols}\"" ].join(' ').trim() } } From eaf597f57d3141e8a327b36d25452c09b73d723e Mon Sep 17 00:00:00 2001 From: Azedine Zoufir <41290849+azedinez@users.noreply.github.com> Date: Fri, 25 Aug 2023 15:55:49 +0100 Subject: [PATCH 55/72] remove querygse as external arg in conf/modules.config Co-authored-by: Jonathan Manning --- conf/modules.config | 1 - 1 file changed, 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index 4c07da9e..c7f07597 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -120,7 +120,6 @@ process { ] ext.prefix = { "normalised." } ext.args = {[ - "--querygse \"${params.querygse}\"", "--metacols \"${params.features_metadata_cols}\"" ].join(' ').trim() } } From 4a419e45e67a6102023be967c6aadde5afa7075f Mon Sep 17 00:00:00 2001 From: Azedine Zoufir <41290849+azedinez@users.noreply.github.com> Date: Fri, 25 Aug 2023 15:57:03 +0100 Subject: [PATCH 56/72] add querygse as input for channel workflows/differentialabundance.nf Co-authored-by: Jonathan Manning --- workflows/differentialabundance.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf index ae42046d..625b565b 100644 --- a/workflows/differentialabundance.nf +++ b/workflows/differentialabundance.nf @@ -32,7 +32,7 @@ if (params.study_type == 'affy_array'){ // then a GSE study identifier must be provided } else if (params.study_type == 'geo_soft_file'){ if (params.querygse && params.features_metadata_cols) { - ch_querygse = Channel.of([exp_meta]) + ch_querygse = Channel.of([[exp_meta], params.querygse]) } else { error("Query GSE not specified or features metadata columns not specified") } From 2f4f6fc717dec12bf7281f7583f97084b28b0d9b Mon Sep 17 00:00:00 2001 From: Azedine Zoufir Date: Fri, 25 Aug 2023 16:02:12 +0100 Subject: [PATCH 57/72] 
revert to using ch_query_gse as input channel for GEOQUERY_GETGEO process --- workflows/differentialabundance.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf index 625b565b..e98b60c4 100644 --- a/workflows/differentialabundance.nf +++ b/workflows/differentialabundance.nf @@ -164,7 +164,7 @@ workflow DIFFERENTIALABUNDANCE { ch_soft_file_input = ch_input .join(ch_querygse) - GEOQUERY_GETGEO(ch_soft_file_input) + GEOQUERY_GETGEO(ch_query_gse) ch_in_raw = GEOQUERY_GETGEO.out.expression ch_soft_features = GEOQUERY_GETGEO.out.annotation From 843c65394c6906264b053fda867be7b05a58dcbb Mon Sep 17 00:00:00 2001 From: Azedine Zoufir <41290849+azedinez@users.noreply.github.com> Date: Fri, 25 Aug 2023 16:03:30 +0100 Subject: [PATCH 58/72] delete join on ch_querygse in workflows/differentialabundance.nf Co-authored-by: Jonathan Manning --- workflows/differentialabundance.nf | 3 --- 1 file changed, 3 deletions(-) diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf index e98b60c4..a31f028f 100644 --- a/workflows/differentialabundance.nf +++ b/workflows/differentialabundance.nf @@ -161,9 +161,6 @@ workflow DIFFERENTIALABUNDANCE { } else if(params.study_type == 'geo_soft_file'){ - ch_soft_file_input = ch_input - .join(ch_querygse) - GEOQUERY_GETGEO(ch_query_gse) ch_in_raw = GEOQUERY_GETGEO.out.expression ch_soft_features = GEOQUERY_GETGEO.out.annotation From b782445d9d8fa45a0b9a5727d7268822c5c72ed4 Mon Sep 17 00:00:00 2001 From: Azedine Zoufir <41290849+azedinez@users.noreply.github.com> Date: Fri, 25 Aug 2023 16:04:21 +0100 Subject: [PATCH 59/72] delete mixing versions for JUSTRMA_NORM Co-authored-by: Jonathan Manning --- workflows/differentialabundance.nf | 3 --- 1 file changed, 3 deletions(-) diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf index a31f028f..f5320b04 100644 --- a/workflows/differentialabundance.nf +++ 
b/workflows/differentialabundance.nf @@ -155,9 +155,6 @@ workflow DIFFERENTIALABUNDANCE { ch_versions = ch_versions .mix(AFFY_JUSTRMA_RAW.out.versions) - ch_versions = ch_versions - .mix(AFFY_JUSTRMA_NORM.out.versions) - } else if(params.study_type == 'geo_soft_file'){ From dc1c33768c19f2f46b1052f2bced53df2f574e6f Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Mon, 28 Aug 2023 09:36:45 +0100 Subject: [PATCH 60/72] Correctly refer to matrices from getgeo as norm --- workflows/differentialabundance.nf | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf index f5320b04..b6a498cf 100644 --- a/workflows/differentialabundance.nf +++ b/workflows/differentialabundance.nf @@ -28,9 +28,10 @@ if (params.study_type == 'affy_array'){ } else { error("CEL files archive not specified!") } - // If this is another array platform and user wish to read from SOFT files - // then a GSE study identifier must be provided } else if (params.study_type == 'geo_soft_file'){ + + // To pull SOFT files from a GEO a GSE study identifer must be provided + if (params.querygse && params.features_metadata_cols) { ch_querygse = Channel.of([[exp_meta], params.querygse]) } else { @@ -39,6 +40,7 @@ if (params.study_type == 'affy_array'){ } else { // If this is not microarray data, and this an RNA-seq dataset, // then assume we're reading from a matrix + if (params.study_type == "rnaseq" && params.matrix) { matrix_file = file(params.matrix, checkIfExists: true) ch_in_raw = Channel.of([ exp_meta, matrix_file]) @@ -159,7 +161,7 @@ workflow DIFFERENTIALABUNDANCE { else if(params.study_type == 'geo_soft_file'){ GEOQUERY_GETGEO(ch_query_gse) - ch_in_raw = GEOQUERY_GETGEO.out.expression + ch_in_norm = GEOQUERY_GETGEO.out.expression ch_soft_features = GEOQUERY_GETGEO.out.annotation ch_versions = ch_versions @@ -219,6 +221,9 @@ workflow DIFFERENTIALABUNDANCE { .join(ch_in_norm) .map{tuple(it[0], [it[1], 
it[2]])} } + else if (params.study_type == 'geo_soft_file'){ + ch_matrices_for_validation = ch_in_norm + } else{ ch_matrices_for_validation = ch_in_raw } From fed96cce12a7b34d399ce3fe48a96728091a4dcd Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Mon, 28 Aug 2023 10:17:11 +0100 Subject: [PATCH 61/72] Make sure supplying metadata cols is optional --- assets/differentialabundance_report.Rmd | 4 +++- conf/modules.config | 6 +++--- docs/usage.md | 4 +++- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/assets/differentialabundance_report.Rmd b/assets/differentialabundance_report.Rmd index 397d7e01..c6b2fbab 100644 --- a/assets/differentialabundance_report.Rmd +++ b/assets/differentialabundance_report.Rmd @@ -210,7 +210,9 @@ if (! params$observations_name_col %in% colnames(observations)){ if (! is.null(params$features)){ features <- read_metadata(file.path(params$input_dir, params$features)) - features <- features[,colnames(features) %in% simpleSplit(params$features_metadata_cols), drop = FALSE] + if (! is.null(params$features_metadata_cols)){ + features <- features[,colnames(features) %in% simpleSplit(params$features_metadata_cols), drop = FALSE] + } } contrasts <- read_metadata(file.path(params$input_dir, params$contrasts_file)) diff --git a/conf/modules.config b/conf/modules.config index c7f07597..a3b1b109 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -119,9 +119,9 @@ process { ] ] ext.prefix = { "normalised." } - ext.args = {[ - "--metacols \"${params.features_metadata_cols}\"" - ].join(' ').trim() } + ext.args = { + ((params.features_metadata_cols == null) ? '' : "--metacols \"${params.features_metadata_cols}\"") + } } withName: DESEQ2_NORM { diff --git a/docs/usage.md b/docs/usage.md index b2d7517e..f8799325 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -63,7 +63,9 @@ Alternatively, the user may want to work with SOFT matrices. 
In this case, setti `--study_type geo_soft_file` and `--querygse [GSE study ID]` -enables the pipeline to download normalised SOFT matrices automatically (note that even though Affymetrix arrays are also supported in the SOFT matrix track, it is recommended to work from CEL files in this case). Importantly, user must provide a valid set of fields for the features metadata relevant for the platforms e.g. for GPL570 (Affymetrix Plus 2.0 arrays) this could be done with +enables the pipeline to download normalised SOFT matrices automatically (note that even though Affymetrix arrays are also supported in the SOFT matrix track, it is recommended to work from CEL files in this case). + +As for other platforms You may subset the metadata features used in reporting etc. e.g. for GPL570 (Affymetrix Plus 2.0 arrays) this could be done with ``` --features_metadata_cols ID,Entrez_Gene_ID,Symbol,Definition From 62feb145ef034beffe498606869a368a0fa16532 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Mon, 28 Aug 2023 10:17:48 +0100 Subject: [PATCH 62/72] Sort out config for SOFT --- .github/workflows/ci.yml | 1 + conf/{test_soft_array.config => soft.config} | 11 +++----- conf/test_soft.config | 27 ++++++++++++++++++++ nextflow.config | 2 +- 4 files changed, 32 insertions(+), 9 deletions(-) rename conf/{test_soft_array.config => soft.config} (71%) create mode 100644 conf/test_soft.config diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6019f4eb..e1a5b997 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -30,6 +30,7 @@ jobs: - "test" - "test_nogtf" - "test_affy" + - "test_soft" steps: - name: Check out pipeline code uses: actions/checkout@v3 diff --git a/conf/test_soft_array.config b/conf/soft.config similarity index 71% rename from conf/test_soft_array.config rename to conf/soft.config index ebeb6c2b..31d09ec2 100644 --- a/conf/test_soft_array.config +++ b/conf/soft.config @@ -1,11 +1,11 @@ /* 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Nextflow config file for running affymetrix array analysis + Nextflow config file for running SOFT array file analysis ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Defines settings specific to affy array analysis + Defines settings specific to array analysis with SOFT files from GEO Use as follows: - nextflow run nf-core/differentialabundance -profile --outdir + nextflow run nf-core/differentialabundance -profile soft, --outdir ---------------------------------------------------------------------------------------- */ @@ -15,11 +15,6 @@ params { config_profile_name = 'SOFT matrix track test profile' config_profile_description = 'Minimal settings for test of the SOFT matrix track' - // Input - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/differentialabundance/testdata/GSE50790.csv' - contrasts = 'https://raw.githubusercontent.com/nf-core/test-datasets/differentialabundance/testdata/GSE50790_contrasts.csv' - querygse = 'GSE50790' - // Study study_type = 'geo_soft_file' study_abundance_type = 'intensities' diff --git a/conf/test_soft.config b/conf/test_soft.config new file mode 100644 index 00000000..fa501803 --- /dev/null +++ b/conf/test_soft.config @@ -0,0 +1,27 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple + pipeline test with SOFT array files from GEO. 
+ + Use as follows: + nextflow run nf-core/differentialabundance -profile test_soft, --outdir + +---------------------------------------------------------------------------------------- +*/ + +includeConfig 'soft_array.config' + +params { + + config_profile_name = 'SOFT matrix track test profile' + config_profile_description = 'Minimal settings for test of the SOFT matrix track' + + // Input + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/differentialabundance/testdata/GSE50790.csv' + contrasts = 'https://raw.githubusercontent.com/nf-core/test-datasets/differentialabundance/testdata/GSE50790_contrasts.csv' + querygse = 'GSE50790' + +} + diff --git a/nextflow.config b/nextflow.config index 4b04f5ea..e47663b5 100644 --- a/nextflow.config +++ b/nextflow.config @@ -311,7 +311,7 @@ profiles { affy { includeConfig 'conf/affy.config' } rnaseq { includeConfig 'conf/rnaseq.config' } test_affy { includeConfig 'conf/test_affy.config' } - test_soft {includeConfig 'conf/test_soft_array.config' } + test_soft {includeConfig 'conf/test_soft.config' } } // Set default registry for Apptainer, Docker, Podman and Singularity independent of -profile From 248756ccc0745524a8d91c9b85a6db08fa7501e0 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Mon, 28 Aug 2023 10:19:15 +0100 Subject: [PATCH 63/72] prettier --- docs/usage.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/usage.md b/docs/usage.md index f8799325..0f394a15 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -63,7 +63,7 @@ Alternatively, the user may want to work with SOFT matrices. In this case, setti `--study_type geo_soft_file` and `--querygse [GSE study ID]` -enables the pipeline to download normalised SOFT matrices automatically (note that even though Affymetrix arrays are also supported in the SOFT matrix track, it is recommended to work from CEL files in this case).
+enables the pipeline to download normalised SOFT matrices automatically (note that even though Affymetrix arrays are also supported in the SOFT matrix track, it is recommended to work from CEL files in this case). As for other platforms You may subset the metadata features used in reporting etc. e.g. for GPL570 (Affymetrix Plus 2.0 arrays) this could be done with From 281a18e68b5ad6773237b7afc7a1e6613c8a590c Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Mon, 28 Aug 2023 10:25:48 +0100 Subject: [PATCH 64/72] Fix soft config import --- conf/test_soft.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/test_soft.config b/conf/test_soft.config index fa501803..880f22e6 100644 --- a/conf/test_soft.config +++ b/conf/test_soft.config @@ -11,7 +11,7 @@ ---------------------------------------------------------------------------------------- */ -includeConfig 'soft_array.config' +includeConfig 'soft.config' params { From 430f5301c8fd4395b613f1b3bebe9383c5f13efc Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Mon, 28 Aug 2023 10:28:34 +0100 Subject: [PATCH 65/72] Fix channel name --- workflows/differentialabundance.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf index b6a498cf..a57a1529 100644 --- a/workflows/differentialabundance.nf +++ b/workflows/differentialabundance.nf @@ -160,7 +160,7 @@ workflow DIFFERENTIALABUNDANCE { } else if(params.study_type == 'geo_soft_file'){ - GEOQUERY_GETGEO(ch_query_gse) + GEOQUERY_GETGEO(ch_querygse) ch_in_norm = GEOQUERY_GETGEO.out.expression ch_soft_features = GEOQUERY_GETGEO.out.annotation From 33389fc0b6f65e25b6fff94a70e87961022bc842 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Mon, 28 Aug 2023 10:33:30 +0100 Subject: [PATCH 66/72] Fix config selector --- conf/modules.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/modules.config b/conf/modules.config index 
a3b1b109..adb2f584 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -100,7 +100,7 @@ process { ].join(' ').trim() } } - withName: READ_FROM_SOFT { + withName: GEOQUERY_GETGEO { publishDir = [ [ path: { "${params.outdir}/tables/processed_abundance" }, From cfaf8ebef9cd5dcf4b3aa9c07c2ddc19278fce04 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Mon, 28 Aug 2023 10:57:02 +0100 Subject: [PATCH 67/72] Fix channel definition --- workflows/differentialabundance.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf index a57a1529..77cb423d 100644 --- a/workflows/differentialabundance.nf +++ b/workflows/differentialabundance.nf @@ -33,7 +33,7 @@ if (params.study_type == 'affy_array'){ // To pull SOFT files from a GEO a GSE study identifer must be provided if (params.querygse && params.features_metadata_cols) { - ch_querygse = Channel.of([[exp_meta], params.querygse]) + ch_querygse = Channel.of([exp_meta, params.querygse]) } else { error("Query GSE not specified or features metadata columns not specified") } From 1c842bb1245af919689e9ccd9342726f59be08d3 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Mon, 28 Aug 2023 10:58:25 +0100 Subject: [PATCH 68/72] Fix CHANGELOG --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 63779a62..feae94dc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,7 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [[#129](https://github.com/nf-core/differentialabundance/pull/129)] - Module updates to fit with recent registry changes ([@pinin4fjords](https://github.com/pinin4fjords), review by [@maxulysse](https://github.com/maxulysse), [@adamrtalbot](https://github.com/adamrtalbot)) - [[#130](https://github.com/nf-core/differentialabundance/pull/130)] - Document reasons for lack of differential expression 
([@pinin4fjords](https://github.com/pinin4fjords), review by [@jfy133](https://github.com/jfy133)) - [[#131](https://github.com/nf-core/differentialabundance/pull/131)] - Improve gtf to table configurability ([@pinin4fjords](https://github.com/pinin4fjords), review by [@WackerO](https://github.com/WackerO)) -- [# 136](https://github.com/nf-core/differentialabundance/pull/136)] - Added support for non-Affymetrix arrays via automatic download of SOFT matrices in GEO +- [[#136](https://github.com/nf-core/differentialabundance/pull/136)] - Added support for non-Affymetrix arrays via automatic download of SOFT matrices in GEO ([@azedinez](https://github.com/azedinez), review by [@pinin4fjords](https://github.com/pinin4fjords)) - [[#137](https://github.com/nf-core/differentialabundance/pull/137)] - Add `--sizefactors_from_controls` and `--gene_id_col` for DESeq2 module to modules.config ([@WackerO](https://github.com/WackerO), review by [@pinin4fjords](https://github.com/pinin4fjords)) - [[#145](https://github.com/nf-core/differentialabundance/pull/145)] - Template update for nf-core/tools v2.9 ([@nf-core-bot](https://github.com/nf-core-bot), review by [@pinin4fjords](https://github.com/pinin4fjords), [@WackerO](https://github.com/WackerO)) From 979085977481a3d7016bfba60b4dc91345a8e257 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Mon, 28 Aug 2023 11:36:23 +0100 Subject: [PATCH 69/72] Final fixes --- workflows/differentialabundance.nf | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf index 77cb423d..c407dfcd 100644 --- a/workflows/differentialabundance.nf +++ b/workflows/differentialabundance.nf @@ -304,9 +304,7 @@ workflow DIFFERENTIALABUNDANCE { } else{ - // - - DESEQ2_NORM ( + DESEQ2_NORM ( ch_contrasts.first(), ch_samples_and_matrix, ch_control_features @@ -408,13 +406,16 @@ workflow DIFFERENTIALABUNDANCE { } .unique() - if(params.study_type !=
"geo_soft_file") { - ch_mat = ch_raw.combine(ch_processed_matrices) + // For geoquery we've done no matrix processing and been supplied with the + // normalised matrix, which can be passed through to downstream analysis + + if(params.study_type == "geo_soft_file") { + ch_mat = ch_norm }else{ - ch_mat = ch_processed_matrices + ch_mat = ch_raw.combine(ch_processed_matrices) } - ch_all_matrices = VALIDATOR.out.sample_meta // meta, samples + ch_all_matrices = VALIDATOR.out.sample_meta // meta, samples .join(VALIDATOR.out.feature_meta) // meta, samples, features .join(ch_mat) // meta, samples, features, raw, norm (or just norm) .map{ From abed79781851f3feb7218bcb10ac6af70fda1bd2 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Mon, 28 Aug 2023 11:38:49 +0100 Subject: [PATCH 70/72] limit resources for testing --- conf/test_soft.config | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/conf/test_soft.config b/conf/test_soft.config index 880f22e6..1fc21677 100644 --- a/conf/test_soft.config +++ b/conf/test_soft.config @@ -18,6 +18,11 @@ params { config_profile_name = 'SOFT matrix track test profile' config_profile_description = 'Minimal settings for test of the SOFT matrix track' + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' + // Input input = 'https://raw.githubusercontent.com/nf-core/test-datasets/differentialabundance/testdata/GSE50790.csv' contrasts = 'https://raw.githubusercontent.com/nf-core/test-datasets/differentialabundance/testdata/GSE50790_contrasts.csv' From 823b4cd2055cc3205152d641e60134d616f0619e Mon Sep 17 00:00:00 2001 From: Azedine Zoufir Date: Mon, 11 Sep 2023 16:17:03 +0100 Subject: [PATCH 71/72] add soft config --- nextflow.config | 1 + 1 file changed, 1 insertion(+) diff --git a/nextflow.config b/nextflow.config index e47663b5..89986f6e 100644 --- a/nextflow.config +++ b/nextflow.config @@ -310,6 +310,7 @@ profiles { test_full { includeConfig 'conf/test_full.config' } affy { 
includeConfig 'conf/affy.config' } rnaseq { includeConfig 'conf/rnaseq.config' } + soft {includeConfig 'conf/soft.config'} test_affy { includeConfig 'conf/test_affy.config' } test_soft {includeConfig 'conf/test_soft.config' } } From 6e2480ca64081e8619b9a8dbcf63b5a19757dc02 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Mon, 11 Sep 2023 16:47:35 +0100 Subject: [PATCH 72/72] Remove rogue tab --- workflows/differentialabundance.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf index c407dfcd..309b2e6d 100644 --- a/workflows/differentialabundance.nf +++ b/workflows/differentialabundance.nf @@ -304,7 +304,7 @@ workflow DIFFERENTIALABUNDANCE { } else{ - DESEQ2_NORM ( + DESEQ2_NORM ( ch_contrasts.first(), ch_samples_and_matrix, ch_control_features