nf-core · WackerO · Apr 30, 2024 · Mar 27, 2024 · Mar 27, 2024 · Mar 27, 2024
diff --git a/.nf-core.yml b/.nf-core.yml
@@ -1 +1,8 @@
 repository_type: pipeline
+lint:
+  nextflow_config:
+    - config_defaults:
+        - params.logo_file
+        - params.css_file
+        - params.citations_file
+        - params.report_file
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### `Added`
 
 - [[#259](https://github.com/nf-core/differentialabundance/pull/259)] - Bump gtf2featureannotation to fix GTF handling error ([@pinin4fjords](https://github.com/pinin4fjords), review by [@WackerO](https://github.com/WackerO))
+- [[#257](https://github.com/nf-core/differentialabundance/pull/257)] - Added maxquant profile to nextflow.config to make it available ([@WackerO](https://github.com/WackerO), review by [@pinin4fjords](https://github.com/pinin4fjords))
 - [[#254](https://github.com/nf-core/differentialabundance/pull/254)] - Some parameter changes, added qbic credits ([@WackerO](https://github.com/WackerO), review by [@pinin4fjords](https://github.com/pinin4fjords))
 - [[#250](https://github.com/nf-core/differentialabundance/pull/250)] - Template update for nf-core/tools v2.13.1 ([@WackerO](https://github.com/WackerO), review by [@pinin4fjords](https://github.com/pinin4fjords))
 - [[#244](https://github.com/nf-core/differentialabundance/pull/244)] - Add pipeline params for matrixfilter NA options ([@WackerO](https://github.com/WackerO), review by [@pinin4fjords](https://github.com/pinin4fjords))
@@ -19,6 +20,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### `Fixed`
 
+- [[#257](https://github.com/nf-core/differentialabundance/pull/257)] - Fixed FILTER_DIFFTABLE module, updated PROTEUS module to better handle whitespace in prefix param, made docs clearer ([@WackerO](https://github.com/WackerO), review by [@pinin4fjords](https://github.com/pinin4fjords))
 - [[#254](https://github.com/nf-core/differentialabundance/pull/254)] - Made differential_file_suffix optional ([@WackerO](https://github.com/WackerO), review by [@pinin4fjords](https://github.com/pinin4fjords))
 - [[#240](https://github.com/nf-core/differentialabundance/pull/240)] - Publish GSEA reports ([@pinin4fjords](https://github.com/pinin4fjords), review by [@WackerO](https://github.com/WackerO))
 - [[#231](https://github.com/nf-core/differentialabundance/pull/231)] - Update GSEA module to fix butterfly plot bug ([@WackerO](https://github.com/WackerO), review by [@pinin4fjords](https://github.com/pinin4fjords))

diff --git a/assets/differentialabundance_report.Rmd b/assets/differentialabundance_report.Rmd
@@ -267,8 +267,9 @@ for (r in seq_along(contributors)) {
 
 ```{r, echo=FALSE}
 observations <- read_metadata(file.path(params$input_dir, params$observations), id_col = params$observations_id_col)
-if (! params$observations_name_col %in% colnames(observations)){
-    stop(paste('Invalid observation name column specified: ', params$observations_name_col, paste0('(Valid values are: ', paste(colnames(observations), collapse=', '),')')))
+observations_name_col <- if (!is.null(params$observations_name_col)) params$observations_name_col else params$observations_id_col  # scalar fallback: use the ID column when no name column is configured
+if (! observations_name_col %in% colnames(observations)){
+    stop(paste('Invalid observation name column specified: ', observations_name_col, paste0('(Valid values are: ', paste(colnames(observations), collapse=', '),')')))
 }
 
 if (! is.null(params$features)){
@@ -305,7 +306,7 @@ assay_data <- lapply(assay_files, function(x) {
         row.names = 1
         )
     )
-    colnames(mat) <- observations[[params$observations_name_col]][match(colnames(mat), rownames(observations))]
+    colnames(mat) <- observations[[observations_name_col]][match(colnames(mat), rownames(observations))]
     mat
 })
 
@@ -316,7 +317,7 @@ if (!is.null(params$features_log2_assays)) {
 assay_data <- cond_log2_transform_assays(assay_data, params$features_log2_assays)
 
 # Now we can rename the observations rows using the title field
-rownames(observations) <- observations[[params$observations_name_col]]
+rownames(observations) <- observations[[observations_name_col]]
 
 # Run PCA early so we can understand how important each variable is
 pca_datas <- lapply(names(assay_data), function(assay_type){
@@ -547,7 +548,7 @@ Whiskers in the above boxplots show `r params$exploratory_whisker_distance` time
 plotly_densityplot(
     assay_data,
     experiment = observations,
-    colorby = params$observations_name_col,
+    colorby = observations_name_col,
     expressiontype = paste("count per", params$features_type),
     makeColorScale(length(unique(observations[[params$observations_id_col]])), palette = "Set1")
 )

diff --git a/conf/maxquant.config b/conf/maxquant.config
@@ -38,7 +38,7 @@ params {
     differential_feature_name_column = "Majority protein IDs"
 
     // Proteus options
-    proteus_measurecol_prefix = 'LFQ intensity '
+    proteus_measurecol_prefix = 'LFQ intensity'
 
     // Shiny does not work for this datatype
     shinyngs_build_app               = false

diff --git a/conf/modules.config b/conf/modules.config
@@ -126,7 +126,7 @@ process {
             "--contrast_variable \"${meta.id}\"",
             "--sample_id_col \"${params.observations_id_col}\"",
             "--protein_id_col \"${params.features_id_col}\"",
-            "--measure_col_prefix \"${params.proteus_measurecol_prefix}\"",
+            "--measure_col_prefix \"${params.proteus_measurecol_prefix}\"".replaceAll(~/_s\b/, ' '), // NOTE(review): rewrites any "_s" followed by a word boundary in this argument to a space - presumably a placeholder for trailing whitespace in the prefix; not described in docs/usage.md, confirm intent
             "--norm_function $params.proteus_norm_function",
             "--plotsd_method $params.proteus_plotsd_method",
             "--plotmv_loess $params.proteus_plotmv_loess",

diff --git a/docs/usage.md b/docs/usage.md
@@ -23,7 +23,11 @@ With the above in mind, running this workflow requires:
 --input '[path to samplesheet file]'
 ```
 
-This may well be the same sample sheet used to generate the input matrix. For example, in RNA-seq this might be the same sample sheet, perhaps derived from [fetchngs](https://github.com/nf-core/fetchngs), that was input to the [RNA-seq workflow](https://github.com/nf-core/rnaseq). It may be necessary to add columns that describe the groups you want to compare.
+This may well be the same sample sheet used to generate the input matrix. For example, in RNA-seq this might be the same sample sheet, perhaps derived from [fetchngs](https://github.com/nf-core/fetchngs), that was input to the [RNA-seq workflow](https://github.com/nf-core/rnaseq). It may be necessary to add columns that describe the groups you want to compare. The columns that the pipeline requires are:
+
+- a column listing the sample IDs (these must match the IDs in the abundance matrix); in the example below it is called 'sample'. For some study types, this column may need to contain file names, e.g. for an Affymetrix analysis.
+- one or more columns describing the conditions for the differential analysis; in the example below this is the 'condition' column.
+- optionally, one or more columns describing sample batches or similar factors that you want to be considered in the analysis; in the example below this is the 'batch' column.
 
 For example:
 
@@ -96,7 +100,7 @@ So we **do not recommend** raw counts files such as `salmon.merged.gene_counts.t
 --matrix '[path to matrix file]'
 ```
 
-This is the proteinGroups.txt file produced by MaxQuant. It is a tab-separated matrix file with a column for every observation (plus additional columns for other types of measurements and information); each row contains these data for a set of proteins. The parameters `--observations_id_col` and `--features_id_col` define which of the associated fields should be matched in those inputs. The parameter `--proteus_measurecol_prefix` defines which prefix is used to extract those matrix columns which contain the measurements to be used. For example, the default `LFQ intensity ` will indicate that columns like LFQ intensity S1, LFQ intensity S2, LFQ intensity S3 etc. are used (do not forget trailing whitespace in this parameter, if required!).
+This is the proteinGroups.txt file produced by MaxQuant. It is a tab-separated matrix file with a column for every observation (plus additional columns for other types of measurements and information); each row contains these data for a set of proteins. The parameters `--observations_id_col` and `--features_id_col` define which of the associated fields should be matched in those inputs. The parameter `--proteus_measurecol_prefix` defines which prefix is used to extract those matrix columns which contain the measurements to be used. For example, the default `LFQ intensity` will indicate that columns like LFQ intensity S1, LFQ intensity S2, LFQ intensity S3 etc. are used (a single whitespace character is added automatically if necessary).
 
 ### Affymetrix microarrays
 

diff --git a/modules.json b/modules.json
@@ -62,7 +62,7 @@
                     },
                     "proteus/readproteingroups": {
                         "branch": "master",
-                        "git_sha": "516189e968feb4ebdd9921806988b4c12b4ac2dc",
+                        "git_sha": "a069b29783583c219c1f23ed3dcf64a5aee1340b",
                         "installed_by": ["modules"]
                     },
                     "rmarkdownnotebook": {

diff --git a/modules/local/filter_difftable.nf b/modules/local/filter_difftable.nf
@@ -2,10 +2,10 @@ process FILTER_DIFFTABLE {
 
     label 'process_single'
 
-    conda "conda-forge::gawk=5.1.0"
+    conda "conda-forge::pandas=1.5.2"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/gawk:5.1.0' :
-        'biocontainers/gawk:5.1.0' }"
+        'https://depot.galaxyproject.org/singularity/pandas:1.5.2' :
+        'biocontainers/pandas:1.5.2' }"
 
     input:
     tuple val(meta), path(input_file)
@@ -20,42 +20,32 @@ process FILTER_DIFFTABLE {
     task.ext.when == null || task.ext.when
 
     script:
-    def VERSION = '9.1' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions.
     """
-    output_file=\$(echo $input_file | sed 's/\\(.*\\)\\..*/\\1/')_filtered.tsv
+    #!/usr/bin/env python
 
-    # Function to find column number
-    find_column_number() {
-        awk -v column="\$2" '{for(i=1;i<=NF;i++) if (\$i == column) {print i; exit}}' <<< "\$(head -n 1 "\$1")"
-    }
+    from os import path
+    import pandas as pd
+    import platform
+    from sys import exit
 
-    # Extract column numbers
-    logFC_col=\$(find_column_number "$input_file" "log2FoldChange")
-    padj_col=\$(find_column_number "$input_file" "padj")
-
-    # Prepare the output file
-    head -n 1 "$input_file" > "\${output_file}.tmp"
-
-    # The following snippet performs the following checks on each row (add +0.0 to the numbers so that they are definitely treated as numerics):
-    #
     # 1. Check that the current logFC/padj is not NA
-    # 2. Check that the current logFC is >= threshold (abs does not work, so use a workaround)
+    # 2. Check that the absolute value of the current logFC is >= threshold
     # 3. Check that the current padj is <= threshold
-    #
     # If this is true, the row is written to the new file, otherwise not
+    if not any("$input_file".endswith(ext) for ext in [".csv", ".tsv", ".txt"]):
+        exit("Please provide a .csv, .tsv or .txt file!")
 
-    awk -F'\\t' -v logFC_col="\$logFC_col" -v padj_col="\$padj_col" -v logFC_thresh="$logFC_threshold" -v padj_thresh="$padj_threshold" '
-        NR > 1 && \$logFC_col != "NA" && \$padj_col != "NA" &&
-        ((\$logFC_col+0.0 >= logFC_thresh+0.0) || (-\$logFC_col+0.0 >= logFC_thresh+0.0)) &&
-        \$padj_col+0.0 <= padj_thresh+0.0 { print }
-    ' "$input_file" >> "\${output_file}.tmp"
+    table = pd.read_csv("$input_file", sep=("," if "$input_file".endswith(".csv") else "\t"), header=0)
+    table = table[~table["$logFC_column"].isna() &
+                ~table["$padj_column"].isna() &
+                (pd.to_numeric(table["$logFC_column"], errors='coerce').abs() >= float("$logFC_threshold")) &
+                (pd.to_numeric(table["$padj_column"], errors='coerce') <= float("$padj_threshold"))]
 
-    # Rename temporary file to final output file
-    mv "\${output_file}.tmp" "\$output_file"
+    table.to_csv(path.splitext(path.basename("$input_file"))[0]+"_filtered.tsv", sep="\t", index=False)
 
-    cat <<-END_VERSIONS > versions.yml
-    "${task.process}":
-        bash: \$(echo \$(bash --version | grep -Eo 'version [[:alnum:].]+' | sed 's/version //'))
-    END_VERSIONS
+    with open('versions.yml', 'a') as version_file:
+        version_file.write('"${task.process}":' + "\\n")
+        version_file.write("    python: " + (platform.python_version()) + "\\n")
+        version_file.write("    pandas: " + str(pd.__version__) + "\\n")
     """
 }
diff --git a/modules/nf-core/proteus/readproteingroups/environment.yml b/modules/nf-core/proteus/readproteingroups/environment.yml
diff --git a/modules/nf-core/proteus/readproteingroups/main.nf b/modules/nf-core/proteus/readproteingroups/main.nf
diff --git a/modules/nf-core/proteus/readproteingroups/meta.yml b/modules/nf-core/proteus/readproteingroups/meta.yml
diff --git a/modules/nf-core/proteus/readproteingroups/templates/proteus_readproteingroups.R b/modules/nf-core/proteus/readproteingroups/templates/proteus_readproteingroups.R
diff --git a/nextflow.config b/nextflow.config
@@ -22,10 +22,10 @@ params {
     sizefactors_from_controls  = false
 
     // Reporting
-    logo_file               = "docs/images/nf-core-differentialabundance_logo_light.png"
-    css_file                = "assets/nf-core_style.css"
-    citations_file          = "CITATIONS.md"
-    report_file             = "assets/differentialabundance_report.Rmd"
+    logo_file               = "$projectDir/docs/images/nf-core-differentialabundance_logo_light.png"
+    css_file                = "$projectDir/assets/nf-core_style.css"
+    citations_file          = "$projectDir/CITATIONS.md"
+    report_file             = "$projectDir/assets/differentialabundance_report.Rmd"
     report_title            = null
     report_author           = null
     report_contributors     = null
@@ -63,7 +63,7 @@ params {
     affy_build_annotation    = true
 
     // Proteus-specific options
-    proteus_measurecol_prefix = 'LFQ intensity '
+    proteus_measurecol_prefix = 'LFQ intensity'
     proteus_norm_function     = 'normalizeMedian'
     proteus_plotsd_method     = 'violin'
     proteus_plotmv_loess      =  true
@@ -342,6 +342,7 @@ profiles {
     test_nogtf { includeConfig 'conf/test_nogtf.config' }
     test_full { includeConfig 'conf/test_full.config' }
     affy { includeConfig 'conf/affy.config' }
+    maxquant { includeConfig 'conf/maxquant.config' }
     rnaseq { includeConfig 'conf/rnaseq.config' }
     soft {includeConfig 'conf/soft.config'}
     test_affy { includeConfig 'conf/test_affy.config' }