Merge branch 'dev' into dev

bigbio · Mar 24, 2024 · 0c87e02 · 0c87e02
2 parents d090107 + 0b9c5ab
commit 0c87e02
Show file tree

Hide file tree

Showing 14 changed files with 384 additions and 24 deletions.
diff --git a/assets/adaptivecard.json b/assets/adaptivecard.json
@@ -54,7 +54,8 @@
                             "body": [
                                 {
                                     "type": "FactSet",
-                                    "facts": [<% out << summary.collect{ k,v -> "{\"title\": \"$k\", \"value\" : \"$v\"}"}.join(",\n") %>
+                                    "facts": [<% out << summary.collect{ k,v -> "{\"title\": \"$k\", \"value\" : \"$v\"}"
+                                        }.join(",\n") %>
                                     ]
                                 }
                             ]

diff --git a/bin/mzml_statistics.py b/bin/mzml_statistics.py
@@ -7,12 +7,12 @@
 import sys
 from pathlib import Path
 import sqlite3
-
+import re
 import pandas as pd
 from pyopenms import MSExperiment, MzMLFile
 
 
-def ms_dataframe(ms_path: str) -> None:
+def ms_dataframe(ms_path: str, id_only: bool = False) -> None:
     file_columns = [
         "SpectrumID",
         "MSLevel",
@@ -25,8 +25,9 @@ def ms_dataframe(ms_path: str) -> None:
         "AcquisitionDateTime",
     ]
 
-    def parse_mzml(file_name: str, file_columns: list):
+    def parse_mzml(file_name: str, file_columns: list, id_only: bool = False):
         info = []
+        psm_part_info = []
         exp = MSExperiment()
         acquisition_datetime = exp.getDateTime().get()
         MzMLFile().load(file_name, exp)
@@ -54,11 +55,23 @@ def parse_mzml(file_name: str, file_columns: list):
                 charge_state = spectrum.getPrecursors()[0].getCharge()
                 emz = spectrum.getPrecursors()[0].getMZ() if spectrum.getPrecursors()[0].getMZ() else None
                 info_list = [id_, MSLevel, charge_state, peak_per_ms, bpc, tic, rt, emz, acquisition_datetime]
+                mz_array = peaks_tuple[0]
+                intensity_array = peaks_tuple[1]
             else:
                 info_list = [id_, MSLevel, None, None, None, None, rt, None, acquisition_datetime]
 
+            if id_only and MSLevel == 2:
+                psm_part_info.append([re.findall(r"[scan|spectrum]=(\d+)", id_)[0], MSLevel, mz_array, intensity_array])
             info.append(info_list)
 
+        if id_only and len(psm_part_info) > 0:
+            pd.DataFrame(psm_part_info, columns=["scan", "ms_level", "mz", "intensity"]).to_csv(
+                f"{Path(ms_path).stem}_spectrum_df.csv",
+                mode="w",
+                index=False,
+                header=True,
+            )
+
         return pd.DataFrame(info, columns=file_columns)
 
     def parse_bruker_d(file_name: str, file_columns: list):
@@ -139,7 +152,7 @@ def parse_bruker_d(file_name: str, file_columns: list):
     if Path(ms_path).suffix == ".d" and Path(ms_path).is_dir():
         ms_df = parse_bruker_d(ms_path, file_columns)
     elif Path(ms_path).suffix in [".mzML", ".mzml"]:
-        ms_df = parse_mzml(ms_path, file_columns)
+        ms_df = parse_mzml(ms_path, file_columns, id_only)
     else:
         msg = f"Unrecognized or inexistent mass spec file '{ms_path}'"
         raise RuntimeError(msg)
@@ -155,7 +168,8 @@ def parse_bruker_d(file_name: str, file_columns: list):
 
 def main():
     ms_path = sys.argv[1]
-    ms_dataframe(ms_path)
+    id_only = sys.argv[2]
+    ms_dataframe(ms_path, id_only)
 
 
 if __name__ == "__main__":

diff --git a/bin/psm_conversion.py b/bin/psm_conversion.py
@@ -0,0 +1,117 @@
+#!/usr/bin/env python
+import numpy as np
+import pyopenms as oms
+import pandas as pd
+import re
+import os
+from pathlib import Path
+import sys
+
+_parquet_field = [
+    "sequence", "protein_accessions", "protein_start_positions", "protein_end_positions",
+    "modifications", "retention_time", "charge", "calc_mass_to_charge", "reference_file_name",
+    "scan_number", "peptidoform", "posterior_error_probability", "global_qvalue", "is_decoy",
+    "consensus_support", "mz_array", "intensity_array", "num_peaks", "search_engines", "id_scores", "hit_rank"
+]
+
+
+def mods_position(peptide):
+    pattern = re.compile(r"\((.*?)\)")
+    original_mods = pattern.findall(peptide)
+    peptide = re.sub(r"\(.*?\)", ".", peptide)
+    position = [i.start() for i in re.finditer(r"\.", peptide)]
+    for j in range(1, len(position)):
+        position[j] -= j
+
+    for k in range(0, len(original_mods)):
+        original_mods[k] = str(position[k]) + "-" + original_mods[k]
+
+    original_mods = [str(i) for i in original_mods] if len(original_mods) > 0 else np.nan
+
+    return original_mods
+
+
+def convert_psm(idxml, spectra_file, export_decoy_psm):
+    prot_ids = []
+    pep_ids = []
+    parquet_data = []
+    consensus_support = np.nan
+    mz_array = []
+    intensity_array = []
+    num_peaks = np.nan
+    id_scores = []
+    search_engines = []
+
+    oms.IdXMLFile().load(idxml, prot_ids, pep_ids)
+    if "ConsensusID" in prot_ids[0].getSearchEngine():
+        if prot_ids[0].getSearchParameters().metaValueExists("SE:MS-GF+"):
+            search_engines = ["MS-GF+"]
+        if prot_ids[0].getSearchParameters().metaValueExists("SE:Comet"):
+            search_engines.append("Comet")
+        if prot_ids[0].getSearchParameters().metaValueExists("SE:Sage"):
+            search_engines.append("Sage")
+    else:
+        search_engines = [prot_ids[0].getSearchEngine()]
+
+    reference_file_name = os.path.splitext(prot_ids[0].getMetaValue("spectra_data")[0].decode("UTF-8"))[0]
+    spectra_df = pd.read_csv(spectra_file) if spectra_file else None
+
+    for peptide_id in pep_ids:
+        retention_time = peptide_id.getRT()
+        calc_mass_to_charge = peptide_id.getMZ()
+        scan_number = int(re.findall(r"(spectrum|scan)=(\d+)", peptide_id.getMetaValue("spectrum_reference"))[0][1])
+
+        if isinstance(spectra_df, pd.DataFrame):
+            spectra = spectra_df[spectra_df["scan"] == scan_number]
+            mz_array = spectra["mz"].values[0]
+            intensity_array = spectra["intensity"].values[0]
+            num_peaks = len(mz_array)
+
+        for hit in peptide_id.getHits():
+            # if remove decoy when mapped to target+decoy?
+            is_decoy = 0 if hit.getMetaValue("target_decoy") == "target" else 1
+            if export_decoy_psm == "false" and is_decoy:
+                continue
+            global_qvalue = np.nan
+            if len(search_engines) > 1:
+                if "q-value" in peptide_id.getScoreType():
+                    global_qvalue = hit.getScore()
+                consensus_support = hit.getMetaValue("consensus_support")
+            elif search_engines == "Comet":
+                id_scores = ["Comet:Expectation value: " + str(hit.getScore())]
+            elif search_engines == "MS-GF+":
+                id_scores = ["MS-GF:SpecEValue: " + str(hit.getScore())]
+            elif search_engines == "Sage":
+                id_scores = ["Sage:hyperscore: " + str(hit.getScore())]
+
+            charge = hit.getCharge()
+            peptidoform = hit.getSequence().toString()
+            modifications = mods_position(peptidoform)
+            sequence = hit.getSequence().toUnmodifiedString()
+            protein_accessions = [ev.getProteinAccession() for ev in hit.getPeptideEvidences()]
+            posterior_error_probability = hit.getMetaValue("Posterior Error Probability_score")
+            protein_start_positions = [ev.getStart() for ev in hit.getPeptideEvidences()]
+            protein_end_positions = [ev.getEnd() for ev in hit.getPeptideEvidences()]
+            hit_rank = hit.getRank()
+
+            parquet_data.append([sequence, protein_accessions, protein_start_positions, protein_end_positions,
+                                 modifications, retention_time, charge, calc_mass_to_charge, reference_file_name,
+                                 scan_number, peptidoform, posterior_error_probability, global_qvalue, is_decoy,
+                                 consensus_support, mz_array, intensity_array, num_peaks, search_engines, id_scores,
+                                 hit_rank])
+
+    pd.DataFrame(parquet_data, columns=_parquet_field).to_csv(f"{Path(idxml).stem}_psm.csv",
+                                                              mode="w",
+                                                              index=False,
+                                                              header=True)
+
+
+def main():
+    idxml_path = sys.argv[1]
+    spectra_file = sys.argv[2]
+    export_decoy_psm = sys.argv[3]
+    convert_psm(idxml_path, spectra_file, export_decoy_psm)
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/conf/modules.config b/conf/modules.config
@@ -245,6 +245,16 @@ process {
         ]
     }
 
+    withName: '.*:DDA_ID:PSMFDRCONTROL:IDFILTER' {
+        ext.args    = "-score:pep \"$params.run_fdr_cutoff\""
+        ext.suffix  = '.idXML'
+        publishDir  = [
+            path: { "${params.outdir}/idfilter" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+        ]
+    }
+
     // PROTEOMICSLFQ
     withName: '.*:LFQ:PROTEOMICSLFQ' {
         ext.args    = "-debug $params.plfq_debug"

diff --git a/modules/local/extract_psm/main.nf b/modules/local/extract_psm/main.nf
@@ -0,0 +1,36 @@
+process PSMCONVERSION {
+    tag "$meta.mzml_id"
+    label 'process_medium'
+
+    conda "bioconda::pyopenms=3.1.0"
+    if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) {
+        container "https://depot.galaxyproject.org/singularity/pyopenms:3.1.0--py39h9b8898c_0"
+    } else {
+        container "biocontainers/pyopenms:3.1.0--py39h9b8898c_0"
+    }
+
+    input:
+    tuple val(meta), path(idxml_file), path(spectrum_df)
+
+    output:
+    path "*_psm.csv", emit: psm_info
+    path "versions.yml", emit: version
+    path "*.log", emit: log
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.mzml_id}"
+
+
+    """
+    psm_conversion.py "${idxml_file}" \\
+        ${spectrum_df} \\
+        $params.export_decoy_psm \\
+        2>&1 | tee extract_idxml.log
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        pyopenms: \$(pip show pyopenms | grep "Version" | awk -F ': ' '{print \$2}')
+    END_VERSIONS
+    """
+}
diff --git a/modules/local/extract_psm/meta.yml b/modules/local/extract_psm/meta.yml
@@ -0,0 +1,34 @@
+name: PSMCONVERSION
+description: A module to extract PSM information from idXML file
+keywords:
+  - PSM
+  - conversion
+tools:
+  - custom:
+      description: |
+        A custom module for PSM extraction.
+      homepage: https://github.com/bigbio/quantms
+      documentation: https://github.com/bigbio/quantms/tree/readthedocs
+input:
+  - idxml_file:
+      type: file
+      description: idXML identification file
+      pattern: "*.idXML"
+  - spectrum_df:
+      type: file
+      description: spectrum data file
+      pattern: "_spectrum_df.csv"
+  - meta:
+      type: map
+      description: Groovy Map containing sample information
+output:
+  - psm_info:
+      type: file
+      description: PSM csv file
+      pattern: "*_psm.csv"
+  - version:
+      type: file
+      description: File containing software version
+      pattern: "versions.yml"
+authors:
+  - "@daichengxin"
diff --git a/modules/local/mzmlstatistics/main.nf b/modules/local/mzmlstatistics/main.nf
@@ -15,6 +15,7 @@ process MZMLSTATISTICS {
 
     output:
     path "*_ms_info.tsv", emit: ms_statistics
+    tuple val(meta), path("*_spectrum_df.csv"), emit: spectrum_df
     path "versions.yml", emit: version
     path "*.log", emit: log
 
@@ -24,6 +25,7 @@ process MZMLSTATISTICS {
 
     """
     mzml_statistics.py "${ms_file}" \\
+        $params.id_only \\
         2>&1 | tee mzml_statistics.log
 
     cat <<-END_VERSIONS > versions.yml

diff --git a/modules/local/mzmlstatistics/meta.yml b/modules/local/mzmlstatistics/meta.yml
@@ -19,6 +19,10 @@ output:
       type: file
       description: mzMLs statistics file
       pattern: "*_mzml_info.tsv"
+  - spectrum_df:
+      type: file
+      description: spectrum data file
+      pattern: "_spectrum_df.csv"
   - version:
       type: file
       description: File containing software version

diff --git a/modules/local/openms/thirdparty/searchenginemsgf/main.nf b/modules/local/openms/thirdparty/searchenginemsgf/main.nf
@@ -49,9 +49,11 @@ process SEARCHENGINEMSGF {
     }
 
     num_enzyme_termini = ""
+    max_missed_cleavages = "-max_missed_cleavages ${params.allowed_missed_cleavages}"
     if (meta.enzyme == "unspecific cleavage")
     {
         num_enzyme_termini = "none"
+        max_missed_cleavages = ""
     }
     else if (params.num_enzyme_termini == "fully")
     {
@@ -75,7 +77,7 @@ process SEARCHENGINEMSGF {
         -max_precursor_charge $params.max_precursor_charge \\
         -min_peptide_length $params.min_peptide_length \\
         -max_peptide_length $params.max_peptide_length \\
-        -max_missed_cleavages $params.allowed_missed_cleavages \\
+        ${max_missed_cleavages} \\
         -isotope_error_range $params.isotope_error_range \\
         -enzyme "${enzyme}" \\
         -tryptic ${msgf_num_enzyme_termini} \\

diff --git a/nextflow.config b/nextflow.config
@@ -15,13 +15,15 @@ params {
     local_input_type   = 'mzML'
     database           = null
     acquisition_method = null
+    id_only            = false
 
     // Input options
     input                      = null
 
     // Tools flags
     posterior_probabilities  = 'percolator'
     add_decoys               = false
+    skip_rescoring           = false
     search_engines           = 'comet'
     sage_processes           = 1
     run_fdr_cutoff           = 0.10
@@ -106,6 +108,9 @@ params {
     // IDPEP flags
     outlier_handling = "none"
 
+    // DDA_ID flags
+    export_decoy_psm   = true
+
     // Percolator flags
     train_FDR                    = 0.05
     test_FDR                     = 0.05

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -58,6 +58,18 @@
                     "description": "Proteomics data acquisition method",
                     "enum": ["dda", "dia"],
                     "fa_icon": "far fa-list-ol"
+                },
+                "id_only": {
+                    "type": "boolean",
+                    "description": "Only perform identification subworkflow.",
+                    "fa_icon": "far fa-check-square",
+                    "help_text": "Only perform identification subworkflow for specific cases."
+                },
+                "export_decoy_psm": {
+                    "type": "boolean",
+                    "description": "Whether export PSM from decoy in final identification results",
+                    "fa_icon": "far fa-check-square",
+                    "help_text": "Whether export PSM from decoy in final identification results for dda_id subworkflow for specific cases."
                 }
             }
         },
@@ -416,6 +428,12 @@
             "description": "Choose between different rescoring/posterior probability calculation methods and set them up.",
             "default": "",
             "properties": {
+                "skip_rescoring": {
+                    "type": "boolean",
+                    "description": "Skip PSM rescoring steps for specific cases, such as studying pure search engine results and search engine ranks",
+                    "default": false,
+                    "fa_icon": "far fa-check-square"
+                },
                 "posterior_probabilities": {
                     "type": "string",
                     "description": "How to calculate posterior probabilities for PSMs:\n\n* 'percolator' = Re-score based on PSM-feature-based SVM and transform distance\n    to hyperplane for posteriors\n* 'fit_distributions' = Fit positive and negative distributions to scores\n    (similar to PeptideProphet)",