Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

small fixs #218

Merged
merged 3 commits into from
Sep 12, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 40 additions & 62 deletions bin/diann_convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,14 @@ def cli():
@click.option("--pg_matrix", "-pg")
@click.option("--pr_matrix", "-pr")
@click.option("--dia_params", "-p")
@click.option("--diann_version", "-v")
@click.option("--fasta", "-f")
@click.option("--charge", "-c")
@click.option("--missed_cleavages", "-m")
@click.option("--qvalue_threshold", "-q", type=float)
@click.pass_context

def convert(ctx, diann_report, exp_design, pg_matrix, pr_matrix, dia_params, fasta, charge, missed_cleavages, qvalue_threshold):
def convert(ctx, diann_report, exp_design, pg_matrix, pr_matrix, dia_params, diann_version, fasta, charge, missed_cleavages, qvalue_threshold):
"""This function is designed to convert the DIA-NN output into three standard formats: MSstats, Triqler and mzTab. These documents are
used for quality control and downstream analysis.

Expand All @@ -38,6 +39,8 @@ def convert(ctx, diann_report, exp_design, pg_matrix, pr_matrix, dia_params, fas
:type pr_matrix: str
:param dia_params: A list contains DIA parameters
:type dia_params: list
:param diann_version: Version of DIA-NN
:type diann_version: str
:param fasta: Path to the fasta file
:type fasta: str
:param charge: The charge assigned by DIA-NN(max_precursor_charge)
Expand Down Expand Up @@ -102,36 +105,37 @@ def convert(ctx, diann_report, exp_design, pg_matrix, pr_matrix, dia_params, fas
out_triqler.to_csv(os.path.splitext(os.path.basename(exp_design))[0] + '_triqler_in.tsv', sep='\t', index=False)

# Convert to mzTab
fasta_df = pd.DataFrame()
entries = []
f = FASTAFile()
f.load(fasta, entries)
line = 0
for e in entries:
fasta_df.loc[line, "id"] = e.identifier
fasta_df.loc[line, "seq"] = e.sequence
fasta_df.loc[line, "len"] = len(e.sequence)
line += 1

index_ref = f_table
index_ref.loc[:, "ms_run"] = index_ref.apply(lambda x: x["Fraction_Group"], axis=1)
index_ref.loc[:, "study_variable"] = index_ref.apply(lambda x: x["Sample"], axis=1)
index_ref.loc[:, "ms_run"] = index_ref.loc[:, "ms_run"].astype("int")
index_ref.loc[:, "study_variable"] = index_ref.loc[:, "study_variable"].astype("int")
report[["ms_run", "study_variable"]] = report.apply(lambda x: add_info(x["Run"], index_ref), axis = 1, result_type = "expand")

(MTD, database) = mztab_MTD(index_ref, dia_params, fasta, charge, missed_cleavages)
PRH = mztab_PRH(report, pg, index_ref, database, fasta_df)
PEH = mztab_PEH(report, pr, precursor_list, index_ref, database, fasta_df)
PSH = mztab_PSH(report, database, fasta_df)
MTD.loc["", :] = ""
PRH.loc[len(PRH) + 1, :] = ""
PEH.loc[len(PEH) + 1, :] = ""
with open(os.path.splitext(os.path.basename(exp_design))[0] + '_out.mztab', "w", newline = "") as f:
MTD.to_csv(f, mode="w", sep = '\t', index = False, header = False)
PRH.to_csv(f, mode="w", sep = '\t', index = False, header = True)
PEH.to_csv(f, mode="w", sep = '\t', index = False, header = True)
PSH.to_csv(f, mode="w", sep = '\t', index = False, header = True)
if diann_version == "1.8.1":
fasta_df = pd.DataFrame()
entries = []
f = FASTAFile()
f.load(fasta, entries)
line = 0
for e in entries:
fasta_df.loc[line, "id"] = e.identifier
fasta_df.loc[line, "seq"] = e.sequence
fasta_df.loc[line, "len"] = len(e.sequence)
line += 1

index_ref = f_table
index_ref.loc[:, "ms_run"] = index_ref.apply(lambda x: x["Fraction_Group"], axis=1)
index_ref.loc[:, "study_variable"] = index_ref.apply(lambda x: x["Sample"], axis=1)
index_ref.loc[:, "ms_run"] = index_ref.loc[:, "ms_run"].astype("int")
index_ref.loc[:, "study_variable"] = index_ref.loc[:, "study_variable"].astype("int")
report[["ms_run", "study_variable"]] = report.apply(lambda x: add_info(x["Run"], index_ref), axis = 1, result_type = "expand")

(MTD, database) = mztab_MTD(index_ref, dia_params, fasta, charge, missed_cleavages)
PRH = mztab_PRH(report, pg, index_ref, database, fasta_df)
PEH = mztab_PEH(report, pr, precursor_list, index_ref, database, fasta_df)
PSH = mztab_PSH(report, database, fasta_df)
MTD.loc["", :] = ""
PRH.loc[len(PRH) + 1, :] = ""
PEH.loc[len(PEH) + 1, :] = ""
with open(os.path.splitext(os.path.basename(exp_design))[0] + '_out.mztab', "w", newline = "") as f:
MTD.to_csv(f, mode="w", sep = '\t', index = False, header = False)
PRH.to_csv(f, mode="w", sep = '\t', index = False, header = True)
PEH.to_csv(f, mode="w", sep = '\t', index = False, header = True)
PSH.to_csv(f, mode="w", sep = '\t', index = False, header = True)


def query_expdesign_value(reference, f_table, s_table):
Expand Down Expand Up @@ -214,6 +218,7 @@ def mztab_MTD(index_ref, dia_params, fasta, charge, missed_cleavages):
:rtype: pandas.core.frame.DataFrame
"""
dia_params_list = dia_params.split(";")
dia_params_list = ["null" if i == "" else i for i in dia_params_list]
FragmentMassTolerance = dia_params_list[0]
FragmentMassToleranceUnit = dia_params_list[1]
PrecursorMassTolerance = dia_params_list[2]
Expand Down Expand Up @@ -314,7 +319,7 @@ def mztab_PRH(report, pg, index_ref, database, fasta_df):
file = list(pg.columns[5:])
col = {}
for i in file:
col[i] = "protein_abundance_assay[" + str(index_ref[index_ref["run"] == i.split("/")[-1].split(".")[-2]]["ms_run"].values[0]) + "]"
col[i] = "protein_abundance_assay[" + str(index_ref[index_ref["run"] == os.path.splitext(os.path.split(i)[1])[0]]["ms_run"].values[0]) + "]"

pg = pg.rename(columns = col)
pg.loc[:, "opt_global_result_type"] = pg.apply(lambda x: classify_result_type(x), axis=1, result_type="expand")
Expand Down Expand Up @@ -399,7 +404,7 @@ def mztab_PEH(report, pr, precursor_list, index_ref, database, fasta_df):
out_mztab_PEH.loc[:, "opt_global_cv_MS:1000889_peptidoform_sequence"] = out_mztab_PEH.apply(
lambda x: AASequence.fromString(x["opt_global_cv_MS:1000889_peptidoform_sequence"]).toString(), axis=1)

out_mztab_PEH.loc[:, "unique"] = out_mztab_PEH.apply(lambda x: Unique(x["accession"], x["sequence"], fasta_df), axis=1, result_type="expand")
out_mztab_PEH.loc[:, "unique"] = out_mztab_PEH.apply(lambda x: "0" if ";" in str(x["accession"]) else "1", axis=1, result_type="expand")

null_col = ["database_version", "search_engine", "retention_time_window", "mass_to_charge"]
for i in null_col:
Expand Down Expand Up @@ -460,10 +465,10 @@ def mztab_PSH(report, database, fasta_df):

out_mztab_PSH.loc[:, "opt_global_cv_MS:1002217_decoy_peptide"] = "0"
out_mztab_PSH.loc[:, "PSM_ID"] = out_mztab_PSH.index
out_mztab_PSH.loc[:, "unique"] = out_mztab_PSH.apply(lambda x: Unique(x["accession"], x["sequence"], fasta_df), axis=1, result_type="expand")
out_mztab_PSH.loc[:, "unique"] = out_mztab_PSH.apply(lambda x: "0" if ";" in str(x["accession"]) else "1", axis=1, result_type="expand")
out_mztab_PSH.loc[:, "database"] = database

null_col = ["database_version", "spectra_ref", "search_engine", "exp_mass_to_charge", "pre", "post",
null_col = ["database_version", "spectra_ref", "search_engine", "unique", "exp_mass_to_charge", "pre", "post",
"start", "end", "opt_global_feature_id", "opt_global_map_index", "opt_global_spectrum_reference"]
for i in null_col:
out_mztab_PSH.loc[:, i] = "null"
Expand Down Expand Up @@ -516,33 +521,6 @@ def classify_result_type(target):
return "single_protein"


def Unique(accession, pep_seq, fasta_df):
'''Find the location of the peptide and determine if the peptide is unique

:param target: The value of "accession" column in out_mztab_PEH or out_mztab_PSH
:type target: str
:param pep_seq: Peptide sequence
:type sep_seq: str
:param fasta_df: A dataframe contains protein IDs, sequences and lengths
:type fasta_df: pandas.core.frame.DataFrame
:return: Unique flag(1 or 0)
:rtype: str
'''
unique = 1
for i in accession.split(";"):
pro_seq = fasta_df[fasta_df["id"].str.contains(i)]["seq"].values[0]
result = re.finditer(pep_seq, pro_seq)
section_list = []
if result:
for i in result:
section_list.append([i.span()[0],i.span()[1]-1])

if len(section_list) != 1:
unique = 0

return str(unique)


def calculate_protein_coverage(report, target, reference, fasta_df):
"""Calculate protein coverage.
:param report: Dataframe for Dia-NN main report
Expand Down
4 changes: 4 additions & 0 deletions modules/local/diannconvert/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ process DIANNCONVERT {
path(report_pr)
val(meta)
path(fasta)
path(diann_version)

output:
path "*msstats_in.csv", emit: out_msstats
Expand All @@ -29,12 +30,15 @@ process DIANNCONVERT {
meta.precursormasstoleranceunit,meta.enzyme,meta.fixedmodifications,meta.variablemodifications].join(';')

"""
diann_version=`cat ${diann_version} | grep DIA-NN | awk -F ": " '{print \$2}'`

diann_convert.py convert \\
--diann_report "${report}" \\
--exp_design "${exp_design}" \\
--pg_matrix "${report_pg}" \\
--pr_matrix "${report_pr}" \\
--dia_params "${dia_params}" \\
--diann_version "\${diann_version}" \\
--fasta "${fasta}" \\
--charge $params.max_precursor_charge \\
--missed_cleavages $params.allowed_missed_cleavages \\
Expand Down
5 changes: 3 additions & 2 deletions workflows/dia.nf
Original file line number Diff line number Diff line change
Expand Up @@ -85,8 +85,9 @@ workflow DIA {
//
// MODULE: DIANNCONVERT
//
DIANNCONVERT(DIANNSUMMARY.out.main_report, ch_expdesign, DIANNSUMMARY.out.pg_matrix, DIANNSUMMARY.out.pr_matrix,
ch_result.meta.first(), params.database)
log.info "DIANNCONVERT is based on the output of DIA-NN 1.8.1, other versions of DIA-NN do not support mzTab conversion."
DIANNCONVERT(DIANNSUMMARY.out.main_report, ch_expdesign, DIANNSUMMARY.out.pg_matrix, DIANNSUMMARY.out.pr_matrix,
ch_result.meta.first(), params.database, DIANNSUMMARY.out.version)
ch_software_versions = ch_software_versions.mix(DIANNCONVERT.out.version.ifEmpty(null))

//
Expand Down