From e43fecf455eb936976a306f7cab52c8620594efa Mon Sep 17 00:00:00 2001 From: Chengxin Dai <37200167+daichengxin@users.noreply.github.com> Date: Tue, 18 Feb 2025 15:52:50 +0800 Subject: [PATCH 1/9] Support for DIA-NN 2.0 --- pyproject.toml | 2 +- quantmsutils/__init__.py | 2 +- quantmsutils/diann/diann2mztab.py | 136 ++++++++++++++++-------------- 3 files changed, 77 insertions(+), 63 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 36452be..4d6d364 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ name = "quantms-utils" description = "Python scripts and helpers for the quantMS workflow" readme = "README.md" license = "MIT" -version = "0.0.17" +version = "0.0.19" authors = [ "Yasset Perez-Riverol ", "Dai Chengxin ", diff --git a/quantmsutils/__init__.py b/quantmsutils/__init__.py index f18e5d0..a11f0b4 100644 --- a/quantmsutils/__init__.py +++ b/quantmsutils/__init__.py @@ -1 +1 @@ -__version__ = "0.0.18" +__version__ = "0.0.19" diff --git a/quantmsutils/diann/diann2mztab.py b/quantmsutils/diann/diann2mztab.py index a405737..cdb535c 100644 --- a/quantmsutils/diann/diann2mztab.py +++ b/quantmsutils/diann/diann2mztab.py @@ -90,7 +90,6 @@ def diann2mztab( "Modified.Sequence", "Precursor.Charge", "Precursor.Quantity", - "File.Name", "Run", ] @@ -101,7 +100,6 @@ def diann2mztab( "PeptideSequence", "PrecursorCharge", "Intensity", - "Reference", "Run", ] out_msstats = out_msstats[out_msstats["Intensity"] != 0] @@ -113,12 +111,6 @@ def diann2mztab( out_msstats["FragmentIon"] = "NA" out_msstats["ProductCharge"] = "0" out_msstats["IsotopeLabelType"] = "L" - unique_reference_map = {k: os.path.basename(k) for k in out_msstats["Reference"].unique()} - out_msstats["Reference"] = out_msstats["Reference"].map(unique_reference_map) - del unique_reference_map - - logger.debug("\n\nReference Column >>>") - logger.debug(out_msstats["Reference"]) logger.debug(f"\n\nout_msstats ({out_msstats.shape}) >>>") logger.debug(out_msstats.head(5)) @@ -325,7 +317,11 @@ def find_first_file_with_suffix(self, suffix: str) -> os.PathLike: @property def report(self) -> os.PathLike: - return self.find_first_file_with_suffix("report.tsv") + # DIA-NN 1.8.1 return tsv format, but DIA-NN 2.0 only return parquet + try: + return self.find_first_file_with_suffix("report.tsv") + except FileNotFoundError as e: + return self.find_first_file_with_suffix("report.parquet") @property def pg_matrix(self) -> os.PathLike: @@ -362,7 +358,7 @@ def diann_version(self) -> str: return diann_version_id def validate_diann_version(self) -> None: - supported_diann_versions = ["1.8.1", "1.9.beta.1", "1.9.2"] + supported_diann_versions = ["1.8.1", "2.0", "2.0.1", "2.0.2"] if self.diann_version not in supported_diann_versions: raise ValueError(f"Unsupported DIANN version {self.diann_version}") @@ -443,7 +439,6 @@ def convert_to_mztab( def main_report_df(self, qvalue_threshold: float) -> pd.DataFrame: remain_cols = [ - "File.Name", "Run", "Protein.Group", "Protein.Names", @@ -465,7 +460,17 @@ def main_report_df(self, qvalue_threshold: float) -> pd.DataFrame: "Global.PG.Q.Value", "MS2.Scan", ] - report = pd.read_csv(self.report, sep="\t", header=0, usecols=remain_cols) + if ".tsv" in self.report.suffix: + report = pd.read_csv(self.report, sep="\t", header=0, usecols=remain_cols) + else: + report = pd.read_parquet(self.report) + # filter decoy + if "Decoy" in report.columns: + logger.debug( + f"Filtering decoy from report: {len(report)} rows" + ) + report = report[report["Decoy"] == 0] + logger.debug(f"Report filtered, {len(report)} rows remaining") # filter based on qvalue parameter for downstream analysiss logger.debug( @@ -1087,60 +1092,69 @@ def __find_info(directory, n): ].astype(str) # TODO seconds returned from precursor.getRT() - target.loc[:, "RT"] = target.apply(lambda x: x["RT"] / 60, axis=1) + # RT column of DIA-NN 2.0.* is float32 type, but RT column of DIA-NN 1.8.* is float64 type + target.loc[:, "RT"] = target.apply(lambda x: (x["RT"] / 60), axis=1) + group['RT'] = group['RT'].astype('float64') rt_matched = pd.merge_asof(group, target, on="RT", direction="nearest") - new_target = target - new_target.columns = [ - "scan_RT", - "scan_opt_global_spectrum_reference", - "MS2.Scan", - "scan_exp_mass_to_charge", - ] - scan_matched = pd.merge(rt_matched, new_target, on="MS2.Scan") + # DIA-NN can't export MS2.Scan column in some versions (eg. DIA-NN 2.0, DIA-NN 2.0.1). + # So we only match MS2 by RT. + if "MS2.Scan" in group.columns: + logger.info("Mapping DIA-NN MS2 to mzML by MS2.Scan and RT columns") + new_target = target + new_target.columns = [ + "scan_RT", + "scan_opt_global_spectrum_reference", + "MS2.Scan", + "scan_exp_mass_to_charge", + ] + scan_matched = pd.merge(rt_matched, new_target, on="MS2.Scan") - # Cross validation spectrum ID between scan matched and RT matched - # Keep Scan matched When RT matched and DIA-NN Scan matched are inconsistent in mzML. - scan_matched["unassigned_matched"] = scan_matched.apply( - lambda row: 1 if row["MS2.Scan"] != row["DIANN-intraID"] else 0, axis=1 - ) - if len(scan_matched[scan_matched["unassigned_matched"] == 1]) > 0: - v_str = scan_matched[scan_matched["unassigned_matched"] == 1]["MS2.Scan"].tolist() - logger.info( - f"RT matched and DIA-NN Scan matched are inconsistent in mzML. Keep Scan matched: {v_str}" - ) - scan_matched.drop( - [ - "RT", - "opt_global_spectrum_reference", - "DIANN-intraID", - "exp_mass_to_charge", - "unassigned_matched", - ], - inplace=True, - axis=1, - ) - scan_matched.rename( - columns={ - "scan_RT": "RT", - "scan_opt_global_spectrum_reference": "opt_global_spectrum_reference", - "scan_exp_mass_to_charge": "exp_mass_to_charge", - }, - inplace=True, + # Cross validation spectrum ID between scan matched and RT matched + # Keep Scan matched When RT matched and DIA-NN Scan matched are inconsistent in mzML. + scan_matched["unassigned_matched"] = scan_matched.apply( + lambda row: 1 if row["MS2.Scan"] != row["DIANN-intraID"] else 0, axis=1 ) + if len(scan_matched[scan_matched["unassigned_matched"] == 1]) > 0: + v_str = scan_matched[scan_matched["unassigned_matched"] == 1]["MS2.Scan"].tolist() + logger.info( + f"RT matched and DIA-NN Scan matched are inconsistent in mzML. Keep Scan matched: {v_str}" + ) + scan_matched.drop( + [ + "RT", + "opt_global_spectrum_reference", + "DIANN-intraID", + "exp_mass_to_charge", + "unassigned_matched", + ], + inplace=True, + axis=1, + ) + scan_matched.rename( + columns={ + "scan_RT": "RT", + "scan_opt_global_spectrum_reference": "opt_global_spectrum_reference", + "scan_exp_mass_to_charge": "exp_mass_to_charge", + }, + inplace=True, + ) + else: + scan_matched.drop( + [ + "scan_RT", + "scan_opt_global_spectrum_reference", + "scan_exp_mass_to_charge", + "unassigned_matched", + ], + inplace=True, + axis=1, + ) + + out_mztab_psh = pd.concat([out_mztab_psh, scan_matched]) else: - scan_matched.drop( - [ - "scan_RT", - "scan_opt_global_spectrum_reference", - "scan_exp_mass_to_charge", - "unassigned_matched", - ], - inplace=True, - axis=1, - ) - - out_mztab_psh = pd.concat([out_mztab_psh, scan_matched]) + logger.info("MS2.Scan column isn't in DIA-NN report, only Matching MS2 by RT values") + out_mztab_psh = pd.concat([out_mztab_psh, rt_matched]) del report From 12dae7270450a96905c7087b7b3d24cd9e843641 Mon Sep 17 00:00:00 2001 From: Chengxin Dai <37200167+daichengxin@users.noreply.github.com> Date: Tue, 18 Feb 2025 16:04:59 +0800 Subject: [PATCH 2/9] Update diann2mztab.py --- quantmsutils/diann/diann2mztab.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/quantmsutils/diann/diann2mztab.py b/quantmsutils/diann/diann2mztab.py index cdb535c..75e995f 100644 --- a/quantmsutils/diann/diann2mztab.py +++ b/quantmsutils/diann/diann2mztab.py @@ -320,7 +320,7 @@ def report(self) -> os.PathLike: # DIA-NN 1.8.1 return tsv format, but DIA-NN 2.0 only return parquet try: return self.find_first_file_with_suffix("report.tsv") - except FileNotFoundError as e: + except FileNotFoundError: return self.find_first_file_with_suffix("report.parquet") @property From f0156c979be77200e509938cd7ef5f3388c6f606 Mon Sep 17 00:00:00 2001 From: Chengxin Dai <37200167+daichengxin@users.noreply.github.com> Date: Wed, 19 Feb 2025 15:03:27 +0800 Subject: [PATCH 3/9] Update diann2mztab.py --- quantmsutils/diann/diann2mztab.py | 112 ++++++++++++++++++------------ 1 file changed, 66 insertions(+), 46 deletions(-) diff --git a/quantmsutils/diann/diann2mztab.py b/quantmsutils/diann/diann2mztab.py index 75e995f..bf1f180 100644 --- a/quantmsutils/diann/diann2mztab.py +++ b/quantmsutils/diann/diann2mztab.py @@ -414,7 +414,7 @@ def convert_to_mztab( sep="\t", header=0, ) - prh = mztab_prh(report, pg, index_ref, database, fasta_df) + prh = mztab_prh(report, pg, index_ref, database, fasta_df, self.diann_version) del pg pr = pd.read_csv( self.pr_matrix, @@ -443,10 +443,8 @@ def main_report_df(self, qvalue_threshold: float) -> pd.DataFrame: "Protein.Group", "Protein.Names", "Protein.Ids", - "First.Protein.Description", "PG.MaxLFQ", "RT", - "MS2.Scan", "Global.Q.Value", "Lib.Q.Value", "PEP", @@ -457,20 +455,13 @@ def main_report_df(self, qvalue_threshold: float) -> pd.DataFrame: "Stripped.Sequence", "Precursor.Charge", "Precursor.Quantity", - "Global.PG.Q.Value", - "MS2.Scan", + "Global.PG.Q.Value" ] - if ".tsv" in self.report.suffix: - report = pd.read_csv(self.report, sep="\t", header=0, usecols=remain_cols) + if "2.0" not in self.diann_version: + report = pd.read_csv(self.report, sep="\t", header=0, + usecols=remain_cols + ["MS2.Scan"]) else: - report = pd.read_parquet(self.report) - # filter decoy - if "Decoy" in report.columns: - logger.debug( - f"Filtering decoy from report: {len(report)} rows" - ) - report = report[report["Decoy"] == 0] - logger.debug(f"Report filtered, {len(report)} rows remaining") + report = pd.read_parquet(self.report, columns=remain_cols + ["Decoy"]) # filter based on qvalue parameter for downstream analysiss logger.debug( @@ -680,7 +671,7 @@ def mztab_mtd(index_ref, dia_params, fasta, charge, missed_cleavages, diann_vers return out_mztab_mtd_t, database -def mztab_prh(report, pg, index_ref, database, fasta_df): +def mztab_prh(report, pg, index_ref, database, fasta_df, diann_version): """ Construct PRH sub-table. @@ -694,6 +685,8 @@ def mztab_prh(report, pg, index_ref, database, fasta_df): :type database: str :param fasta_df: A dataframe contains protein IDs, sequences and lengths :type fasta_df: pandas.core.frame.DataFrame + :param diann_version: DIA-NN version + :type diann_version: str :return: PRH sub-table :rtype: pandas.core.frame.DataFrame """ @@ -704,7 +697,11 @@ def mztab_prh(report, pg, index_ref, database, fasta_df): f" input index_ref shape: {index_ref.shape}," f" input fasta_df shape: {fasta_df.shape}" ) - file = list(pg.columns[5:]) + # DIA-NN 2.0 PG file doesn't have Protein IDs column + if "2.0" not in diann_version: + file = list(pg.columns[5:]) + else: + file = list(pg.columns[4:]) col = {} for i in file: col[i] = ( @@ -966,6 +963,7 @@ def mztab_peh( } name_mapper = name_mapper_builder(subname_mapper) tmp.rename(columns=name_mapper, inplace=True) + print(tmp.columns) out_mztab_peh = out_mztab_peh.merge( tmp.rename(columns={"precursor.Index": "pr_id"}), on="pr_id", @@ -1159,40 +1157,62 @@ def __find_info(directory, n): del report # Score at PSM level: Q.Value - out_mztab_psh = out_mztab_psh[ - [ - "Stripped.Sequence", - "Protein.Group", - "Q.Value", - "RT", - "Precursor.Charge", - "Calculate.Precursor.Mz", - "exp_mass_to_charge", - "Modified.Sequence", - "PEP", - "Global.Q.Value", - "Global.Q.Value", - "opt_global_spectrum_reference", - "ms_run", - ] - ] - out_mztab_psh.columns = [ - "sequence", - "accession", - "search_engine_score[1]", - "retention_time", - "charge", - "calc_mass_to_charge", + psm_columns = [ + "Stripped.Sequence", + "Protein.Group", + "Q.Value", + "RT", + "Precursor.Charge", + "Calculate.Precursor.Mz", "exp_mass_to_charge", - "opt_global_cv_MS:1000889_peptidoform_sequence", - "opt_global_SpecEValue_score", - "opt_global_q-value", - "opt_global_q-value_score", + "Modified.Sequence", + "PEP", + "Global.Q.Value", + "Global.Q.Value", "opt_global_spectrum_reference", "ms_run", ] - out_mztab_psh.loc[:, "opt_global_cv_MS:1002217_decoy_peptide"] = "0" + if "Decoy" in out_mztab_psh.columns: + out_mztab_psh = out_mztab_psh[ + psm_columns + ["Decoy"] + ] + out_mztab_psh.columns = [ + "sequence", + "accession", + "search_engine_score[1]", + "retention_time", + "charge", + "calc_mass_to_charge", + "exp_mass_to_charge", + "opt_global_cv_MS:1000889_peptidoform_sequence", + "opt_global_SpecEValue_score", + "opt_global_q-value", + "opt_global_q-value_score", + "opt_global_spectrum_reference", + "ms_run", + "opt_global_cv_MS:1002217_decoy_peptide" + ] + else: + out_mztab_psh = out_mztab_psh[ + psm_columns + ] + out_mztab_psh.columns = [ + "sequence", + "accession", + "search_engine_score[1]", + "retention_time", + "charge", + "calc_mass_to_charge", + "exp_mass_to_charge", + "opt_global_cv_MS:1000889_peptidoform_sequence", + "opt_global_SpecEValue_score", + "opt_global_q-value", + "opt_global_q-value_score", + "opt_global_spectrum_reference", + "ms_run" + ] + out_mztab_psh.loc[:, "opt_global_cv_MS:1002217_decoy_peptide"] = 0 out_mztab_psh.loc[:, "PSM_ID"] = out_mztab_psh.index out_mztab_psh.loc[:, "unique"] = out_mztab_psh.apply( lambda x: "0" if ";" in str(x["accession"]) else "1", From a8b3ed38f7aa2499195c9022f9415e83c3906051 Mon Sep 17 00:00:00 2001 From: Chengxin Dai <37200167+daichengxin@users.noreply.github.com> Date: Thu, 20 Feb 2025 22:33:10 +0800 Subject: [PATCH 4/9] fixed error and clean codes --- quantmsutils/diann/diann2mztab.py | 302 +++++++++++++----------------- 1 file changed, 129 insertions(+), 173 deletions(-) diff --git a/quantmsutils/diann/diann2mztab.py b/quantmsutils/diann/diann2mztab.py index bf1f180..a130eb2 100644 --- a/quantmsutils/diann/diann2mztab.py +++ b/quantmsutils/diann/diann2mztab.py @@ -48,15 +48,15 @@ @click.option("--enable_diann2mztab", "-e", is_flag=True) @click.pass_context def diann2mztab( - ctx, - folder, - exp_design, - dia_params, - diann_version, - charge, - missed_cleavages, - qvalue_threshold, - enable_diann2mztab + ctx, + folder, + exp_design, + dia_params, + diann_version, + charge, + missed_cleavages, + qvalue_threshold, + enable_diann2mztab ): """ Convert DIA-NN output to MSstats, Triqler or mzTab. @@ -94,7 +94,12 @@ def diann2mztab( ] logger.debug("Converting to MSstats format...") - out_msstats = report[msstats_columns_keep] + # Filter Decoy precursors for MSstats analysis, the decoy PSMs are only store in mzTab + if "Decoy" in report.columns: + out_msstats = report[report["Decoy"] != 1] + out_msstats = out_msstats[msstats_columns_keep] + else: + out_msstats = report[msstats_columns_keep] out_msstats.columns = [ "ProteinName", "PeptideSequence", @@ -104,9 +109,11 @@ def diann2mztab( ] out_msstats = out_msstats[out_msstats["Intensity"] != 0] - # Q: What is this line doing? + # Convert Unimod ID to modification name and replace ^ with DECOY_ out_msstats.loc[:, "PeptideSequence"] = out_msstats.apply( - lambda x: AASequence.fromString(x["PeptideSequence"]).toString(), axis=1 + lambda x: AASequence.fromString(x["PeptideSequence"]).toString() if "^" not in x["PeptideSequence"] else + "^" + AASequence.fromString(x["PeptideSequence"].replace("^", "")).toString(), + axis=1 ) out_msstats["FragmentIon"] = "NA" out_msstats["ProductCharge"] = "0" @@ -222,7 +229,7 @@ def get_exp_design_dfs(exp_design_file): f_table = pd.DataFrame(f_table, columns=f_header) f_table.loc[:, "run"] = f_table.apply(lambda x: _true_stem(x["Spectra_Filepath"]), axis=1) - s_table = [i.replace("\n", "").split("\t") for i in data[empty_row + 1 :]][1:] + s_table = [i.replace("\n", "").split("\t") for i in data[empty_row + 1:]][1:] s_header = data[empty_row + 1].replace("\n", "").split("\t") s_data_frame = pd.DataFrame(s_table, columns=s_header) @@ -250,6 +257,9 @@ def compute_mass_modified_peptide(peptide_seq: str) -> float: "O": "X[237.14773053312]", # 255.158295 - 17.003288 - 1.00727646688 } for aa in peptide_seq: + # DIA-NN Decoy tag + if aa == "^": + continue # Check if the letter is in aminoacid if aa == "(": not_mod = False @@ -259,35 +269,36 @@ def compute_mass_modified_peptide(peptide_seq: str) -> float: if aa in aa_mass and not_mod: aa = aa_mass[aa] elif ( - aa - not in [ - "G", - "A", - "V", - "L", - "I", - "F", - "M", - "P", - "W", - "S", - "C", - "T", - "Y", - "N", - "Q", - "D", - "E", - "K", - "R", - "H", - ] - and not_mod - and aa != ")" + aa + not in [ + "G", + "A", + "V", + "L", + "I", + "F", + "M", + "P", + "W", + "S", + "C", + "T", + "Y", + "N", + "Q", + "D", + "E", + "K", + "R", + "H", + ] + and not_mod + and aa != ")" ): logger.info(f"Unknown amino acid with mass not known:{aa}") peptide_parts.append(aa) new_peptide_seq = "".join(peptide_parts) + # DIA-NN maybe output "^", see https://github.com/vdemichev/DiaNN/issues/1404 mass = AASequence.fromString(new_peptide_seq).getMonoWeight() logger.debug(new_peptide_seq + ":" + str(mass)) return mass @@ -363,13 +374,13 @@ def validate_diann_version(self) -> None: raise ValueError(f"Unsupported DIANN version {self.diann_version}") def convert_to_mztab( - self, - report, - f_table, - charge: int, - missed_cleavages: int, - dia_params: List[Any], - out: Union[os.PathLike, str], + self, + report, + f_table, + charge: int, + missed_cleavages: int, + dia_params: List[Any], + out: Union[os.PathLike, str], ) -> None: logger.info("Converting to mzTab") self.validate_diann_version() @@ -392,21 +403,21 @@ def convert_to_mztab( index_ref = f_table.copy() index_ref.rename( columns={ - "Fraction_Group": "ms_run", + "Fraction_Group": "assay", "Sample": "study_variable", "run": "Run", }, inplace=True, ) - index_ref["ms_run"] = index_ref["ms_run"].astype("int") + index_ref["assay"] = index_ref["assay"].astype("int") index_ref["study_variable"] = index_ref["study_variable"].astype("int") report = report.merge( - index_ref[["ms_run", "Run", "study_variable"]], + index_ref[["assay", "Run", "study_variable"]], on="Run", validate="many_to_one", ) - mtd, database = mztab_mtd( + mtd, database, ms_run_index = mztab_mtd( index_ref, dia_params, str(self.fasta), charge, missed_cleavages, self.diann_version ) pg = pd.read_csv( @@ -422,9 +433,9 @@ def convert_to_mztab( header=0, ) precursor_list = list(report["Precursor.Id"].unique()) - peh = mztab_peh(report, pr, precursor_list, index_ref, database) + peh = mztab_peh(report, pr, precursor_list, index_ref, database, ms_run_index) del pr - psh = mztab_psh(report, str(self.base_path), database) + psh = mztab_psh(report, str(self.base_path), database, ms_run_index) del report mtd.loc["", :] = "" prh.loc[len(prh) + 1, :] = "" @@ -478,8 +489,8 @@ def main_report_df(self, qvalue_threshold: float) -> pd.DataFrame: } mass_vector = report["Modified.Sequence"].map(uniq_masses) report["Calculate.Precursor.Mz"] = ( - mass_vector + (PROTON_MASS_U * report["Precursor.Charge"]) - ) / report["Precursor.Charge"] + mass_vector + (PROTON_MASS_U * report["Precursor.Charge"]) + ) / report["Precursor.Charge"] logger.debug("Indexing Precursors") # Making the map is 1500x faster @@ -580,16 +591,16 @@ def mztab_mtd(index_ref, dia_params, fasta, charge, missed_cleavages, diann_vers out_mztab_mtd.loc[1, "software[1]-setting[1]"] = fasta out_mztab_mtd.loc[1, "software[1]-setting[2]"] = "db_version:null" out_mztab_mtd.loc[1, "software[1]-setting[3]"] = ( - "fragment_mass_tolerance:" + fragment_mass_tolerance + "fragment_mass_tolerance:" + fragment_mass_tolerance ) out_mztab_mtd.loc[1, "software[1]-setting[4]"] = ( - "fragment_mass_tolerance_unit:" + fragment_mass_tolerance_unit + "fragment_mass_tolerance_unit:" + fragment_mass_tolerance_unit ) out_mztab_mtd.loc[1, "software[1]-setting[5]"] = ( - "precursor_mass_tolerance:" + precursor_mass_tolerance + "precursor_mass_tolerance:" + precursor_mass_tolerance ) out_mztab_mtd.loc[1, "software[1]-setting[6]"] = ( - "precursor_mass_tolerance_unit:" + precursor_mass_tolerance_unit + "precursor_mass_tolerance_unit:" + precursor_mass_tolerance_unit ) out_mztab_mtd.loc[1, "software[1]-setting[7]"] = "enzyme:" + enzyme out_mztab_mtd.loc[1, "software[1]-setting[8]"] = "enzyme_term_specificity:full" @@ -625,29 +636,35 @@ def mztab_mtd(index_ref, dia_params, fasta, charge, missed_cleavages, diann_vers out_mztab_mtd.loc[1, "protein-quantification_unit"] = "[, , Abundance, ]" out_mztab_mtd.loc[1, "peptide-quantification_unit"] = "[, , Abundance, ]" - for i in range(1, max(index_ref["ms_run"]) + 1): + # Construct ms run index + ms_run_index = {} + for i in range(1, len(index_ref["Run"]) + 1): out_mztab_mtd.loc[1, "ms_run[" + str(i) + "]-format"] = "[MS, MS:1000584, mzML file, ]" out_mztab_mtd.loc[1, "ms_run[" + str(i) + "]-location"] = ( - "file://" + index_ref[index_ref["ms_run"] == i]["Spectra_Filepath"].values[0] + "file://" + index_ref.loc[i - 1, "Spectra_Filepath"] ) out_mztab_mtd.loc[1, "ms_run[" + str(i) + "]-id_format"] = ( "[MS, MS:1000777, spectrum identifier nativeID format, ]" ) + ms_run_index[index_ref.loc[i - 1, "Run"]] = i + + # Construct assay index by sample + for i in range(1, max(index_ref["study_variable"]) + 1): + assays_run = [] + for j in list(index_ref[index_ref["study_variable"] == i].index): + assays_run.append("ms_run[" + str(j + 1) + "]") + out_mztab_mtd.loc[1, "assay[" + str(i) + "]-quantification_reagent"] = ( "[MS, MS:1002038, unlabeled sample, ]" ) - out_mztab_mtd.loc[1, "assay[" + str(i) + "]-ms_run_ref"] = "ms_run[" + str(i) + "]" + out_mztab_mtd.loc[1, "assay[" + str(i) + "]-ms_run_ref"] = ",".join(assays_run) + # Construct assay index by sample with warnings.catch_warnings(): warnings.simplefilter("ignore") # This is used here in order to ignore performance warnings from pandas. for i in range(1, max(index_ref["study_variable"]) + 1): - study_variable = [] - for j in list(index_ref[index_ref["study_variable"] == i]["ms_run"].values): - study_variable.append("assay[" + str(j) + "]") - out_mztab_mtd.loc[1, "study_variable[" + str(i) + "]-assay_refs"] = ",".join( - study_variable - ) + out_mztab_mtd.loc[1, "study_variable[" + str(i) + "]-assay_refs"] = "assay[" + str(i) + "]" out_mztab_mtd.loc[1, "study_variable[" + str(i) + "]-description"] = ( "no description given" ) @@ -668,7 +685,7 @@ def mztab_mtd(index_ref, dia_params, fasta, charge, missed_cleavages, diann_vers out_mztab_mtd_t.insert(0, "index", index) database = os.path.basename(fasta.split(".")[-2]) - return out_mztab_mtd_t, database + return out_mztab_mtd_t, database, ms_run_index def mztab_prh(report, pg, index_ref, database, fasta_df, diann_version): @@ -702,15 +719,6 @@ def mztab_prh(report, pg, index_ref, database, fasta_df, diann_version): file = list(pg.columns[5:]) else: file = list(pg.columns[4:]) - col = {} - for i in file: - col[i] = ( - "protein_abundance_assay[" - + str(index_ref[index_ref["Run"] == _true_stem(i)]["ms_run"].values[0]) - + "]" - ) - - pg.rename(columns=col, inplace=True) logger.debug("Classifying results type ...") pg["opt_global_result_type"] = "single_protein" @@ -748,7 +756,7 @@ def mztab_prh(report, pg, index_ref, database, fasta_df, diann_version): protein_details_df = out_mztab_prh[ out_mztab_prh["opt_global_result_type"] == "indistinguishable_protein_group" - ] + ] prh_series = ( protein_details_df["Protein.Group"] .str.split(";", expand=True) @@ -765,12 +773,20 @@ def mztab_prh(report, pg, index_ref, database, fasta_df, diann_version): if len(protein_details_df) > 0: logger.info(f"Found {len(protein_details_df)} indistinguishable protein groups") # The Following line fails if there are no indistinguishable protein groups - protein_details_df.loc[:, "col"] = "protein_details" + protein_details_df.loc[:, "opt_global_result_type"] = "protein_details" # protein_details_df = protein_details_df[-protein_details_df["accession"].str.contains("-")] - out_mztab_prh = pd.concat([out_mztab_prh, protein_details_df]).reset_index(drop=True) + out_mztab_prh = pd.concat([out_mztab_prh, protein_details_df], ignore_index=True).reset_index(drop=True) else: logger.info("No indistinguishable protein groups found") + # Rename file name to protein abundance, and calculate mean of ms runs from same sample + for i in range(1, max(index_ref["study_variable"]) + 1): + assays_run = [] + for j in list(index_ref[index_ref["study_variable"] == i].index): + assays_run.append(os.path.basename(index_ref.loc[j, "Spectra_Filepath"])) + out_mztab_prh["protein_abundance_assay[" + str(i) + "]"] = out_mztab_prh[assays_run].mean(axis=1) + out_mztab_prh.drop(assays_run, inplace=True, axis=1) + logger.debug("Calculating protein coverage (bottleneck)...") # This is a bottleneck # reimplementation runs in 67s vs 137s (old) in my data @@ -858,11 +874,12 @@ def mztab_prh(report, pg, index_ref, database, fasta_df, diann_version): def mztab_peh( - report: pd.DataFrame, - pr: pd.DataFrame, - precursor_list: List[str], - index_ref: pd.DataFrame, - database: os.PathLike, + report: pd.DataFrame, + pr: pd.DataFrame, + precursor_list: List[str], + index_ref: pd.DataFrame, + database: os.PathLike, + ms_run_index: Dict ) -> pd.DataFrame: """ Construct PEH sub-table. @@ -877,6 +894,8 @@ def mztab_peh( :type index_ref: pandas.core.frame.DataFrame :param database: Path to fasta file :type database: str + :param ms_run_index: Dict for MS run index + :type ms_run_index: dict :return: PEH sub-table :rtype: pandas.core.frame.DataFrame """ @@ -935,7 +954,6 @@ def mztab_peh( ] for i in null_col: out_mztab_peh.loc[:, i] = "null" - out_mztab_peh.loc[:, "opt_global_cv_MS:1002217_decoy_peptide"] = "0" logger.debug("Matching precursor IDs...") # Pre-calculating the indices and using a lookup table drops run time from @@ -948,14 +966,18 @@ def mztab_peh( logger.debug("Getting scores per run") # This implementation is 422-700x faster than the apply-based one tmp = ( - report.groupby(["precursor.Index", "ms_run"]) + report.groupby(["precursor.Index", "Run"]) .agg({"Q.Value": ["min"]}) .reset_index() - .pivot(columns=["ms_run"], index="precursor.Index") + .pivot(columns=["Run"], index="precursor.Index") .reset_index() ) + tmp.columns = pd.Index( - ["::".join([str(s) for s in col]).strip() for col in tmp.columns.values] + ["::".join([str(s) if s in ["precursor.Index", + "Q.Value", + "min", + ""] else str(ms_run_index[s]) for s in col]).strip() for col in tmp.columns.values] ) subname_mapper = { "precursor.Index::::": "precursor.Index", @@ -963,7 +985,6 @@ def mztab_peh( } name_mapper = name_mapper_builder(subname_mapper) tmp.rename(columns=name_mapper, inplace=True) - print(tmp.columns) out_mztab_peh = out_mztab_peh.merge( tmp.rename(columns={"precursor.Index": "pr_id"}), on="pr_id", @@ -1019,6 +1040,14 @@ def mztab_peh( out_mztab_peh.loc[:, "PEH"] = "PEP" out_mztab_peh.loc[:, "database"] = str(database) index = out_mztab_peh.loc[:, "PEH"] + # Merge Decoy, I think Decoy is unique for Precursor. + if "Decoy" in report.columns: + out_mztab_peh = pd.merge(out_mztab_peh, report[["Precursor.Id", "Decoy"]].drop_duplicates(), + on="Precursor.Id", how="inner") + out_mztab_peh.rename(columns={"Decoy": "opt_global_cv_MS:1002217_decoy_peptide"}, inplace=True) + else: + out_mztab_peh.loc[:, "opt_global_cv_MS:1002217_decoy_peptide"] = "0" + out_mztab_peh.drop(["PEH", "Precursor.Id", "Genes", "pr_id"], axis=1, inplace=True) out_mztab_peh.insert(0, "PEH", index) out_mztab_peh.fillna("null", inplace=True) @@ -1030,7 +1059,7 @@ def mztab_peh( return out_mztab_peh -def mztab_psh(report, folder, database): +def mztab_psh(report, folder, database, ms_run_index): """ Construct PSH sub-table. @@ -1042,6 +1071,8 @@ def mztab_psh(report, folder, database): :type folder: str :param database: Path to fasta file :type database: str + :param ms_run_index: Dict for MS run index + :type ms_run_index: dict :return: PSH sub-table :rtype: pandas.core.frame.DataFrame """ @@ -1069,7 +1100,6 @@ def __find_info(directory, n): file = __find_info(folder, n) target = pd.read_parquet(file) - # Read original parquet columns from mzml_stats target = target[target[MS_LEVEL] == 2] target.reset_index(inplace=True, drop=True) @@ -1086,8 +1116,8 @@ def __find_info(directory, n): # Standardize spectrum identifier format for bruker data if not isinstance(target.loc[0, "opt_global_spectrum_reference"], str): target.loc[:, "opt_global_spectrum_reference"] = "scan=" + target.loc[ - :, "opt_global_spectrum_reference" - ].astype(str) + :, "opt_global_spectrum_reference" + ].astype(str) # TODO seconds returned from precursor.getRT() # RT column of DIA-NN 2.0.* is float32 type, but RT column of DIA-NN 1.8.* is float64 type @@ -1170,13 +1200,13 @@ def __find_info(directory, n): "Global.Q.Value", "Global.Q.Value", "opt_global_spectrum_reference", - "ms_run", + "Run", ] if "Decoy" in out_mztab_psh.columns: out_mztab_psh = out_mztab_psh[ psm_columns + ["Decoy"] - ] + ] out_mztab_psh.columns = [ "sequence", "accession", @@ -1242,14 +1272,14 @@ def __find_info(directory, n): ) out_mztab_psh.loc[:, "spectra_ref"] = out_mztab_psh.apply( - lambda x: "ms_run[{}]:".format(x["ms_run"]) + x["opt_global_spectrum_reference"], + lambda x: "ms_run[{}]:".format(ms_run_index[x["ms_run"]]) + x["opt_global_spectrum_reference"], axis=1, result_type="expand", ) out_mztab_psh.loc[:, "opt_global_cv_MS:1000889_peptidoform_sequence"] = out_mztab_psh.apply( lambda x: AASequence.fromString( - x["opt_global_cv_MS:1000889_peptidoform_sequence"] + x["opt_global_cv_MS:1000889_peptidoform_sequence"].replace("^", "") ).toString(), axis=1, result_type="expand", @@ -1269,24 +1299,6 @@ def __find_info(directory, n): return out_mztab_psh -def add_info(target, index_ref): - """ - On the basis of f_table, two columns "ms_run" and "study_variable" are added for matching. - - :param target: The value of "Run" column in f_table - :type target: str - :param index_ref: A dataframe on the basis of f_table - :type index_ref: pandas.core.frame.DataFrame - :return: A tuple contains ms_run and study_variable - :rtype: tuple - """ - match = index_ref[index_ref["Run"] == target] - ms_run = match["ms_run"].values[0] - study_variable = match["study_variable"].values[0] - - return ms_run, study_variable - - def classify_result_type(target): """Classify proteins @@ -1300,62 +1312,6 @@ def classify_result_type(target): return "single_protein" -def match_in_report(report, target, max_, flag, level): - """ - This function is used to match the columns "ms_run" and "study_variable" from the report and - get the corresponding information for the mztab ms_run and study_values metadata values. - - :param report: Dataframe for Dia-NN main report - :type report: pandas.core.frame.DataFrame - :param target: The value of "pr_id" column in out_mztab_PEH(level="peptide") or the "accession" column in out_mztab_PRH(level="protein") - :type target: str - :param max_: max_assay or max_study_variable - :type max_: int - :param flag: Match the "study_variable" column(flag=1) or the "ms_run" column(flag=0) in the filter result - :type flag: int - :param level: "pep" or "protein" - :type level: str - :return: A tuple contains multiple messages - :rtype: tuple - """ # noqa - if flag == 1 and level == "pep": - result = report[report["precursor.Index"] == target] - peh_params = [] - for i in range(1, max_ + 1): - match = result[result["study_variable"] == i] - peh_params.extend( - [ - match["Precursor.Normalised"].mean(), - "null", - "null", - "null", - match["RT.Start"].mean(), - ] - ) - - return tuple(peh_params) - - if flag == 0 and level == "pep": - result = report[report["precursor.Index"] == target] - q_value = [] - for i in range(1, max_ + 1): - match = result[result["ms_run"] == i] - q_value.append( - match["Q.Value"].values[0] if match["Q.Value"].values.size > 0 else np.nan - ) - - return tuple(q_value) - - if flag == 1 and level == "protein": - result = report[report["Protein.Group"] == target] - prh_params = [] - for i in range(1, max_ + 1): - match = result[result["study_variable"] == i] - prh_params.extend([match["PG.MaxLFQ"].mean(), "null", "null"]) - - return tuple(prh_params) - - class ModScoreLooker: """ Class used to cache the lookup table of accessions to best scores and their @@ -1605,7 +1561,7 @@ def calculate_coverage(ref_sequence: str, sequences: Set[str]): for start, length in sorted(zip(starts, lengths)): if merged_starts and merged_starts[-1] + merged_lengths[-1] >= start: merged_lengths[-1] = ( - max(merged_starts[-1] + merged_lengths[-1], start + length) - merged_starts[-1] + max(merged_starts[-1] + merged_lengths[-1], start + length) - merged_starts[-1] ) else: merged_starts.append(start) @@ -1617,7 +1573,7 @@ def calculate_coverage(ref_sequence: str, sequences: Set[str]): def calculate_protein_coverages( - report: pd.DataFrame, out_mztab_prh: pd.DataFrame, fasta_df: pd.DataFrame + report: pd.DataFrame, out_mztab_prh: pd.DataFrame, fasta_df: pd.DataFrame ) -> List[str]: """Calculates protein coverages for the PRH table. From d2a9783f8621a6dbf24413e9c84851e29a63c546 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Thu, 20 Feb 2025 14:53:58 +0000 Subject: [PATCH 5/9] REmove the feature commands -> quantms-rescoring --- quantmsutils/features/__init__.py | 0 quantmsutils/features/sage_feature.py | 38 --------- quantmsutils/features/snr.py | 113 -------------------------- 3 files changed, 151 deletions(-) delete mode 100644 quantmsutils/features/__init__.py delete mode 100644 quantmsutils/features/sage_feature.py delete mode 100644 quantmsutils/features/snr.py diff --git a/quantmsutils/features/__init__.py b/quantmsutils/features/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/quantmsutils/features/sage_feature.py b/quantmsutils/features/sage_feature.py deleted file mode 100644 index 14c9227..0000000 --- a/quantmsutils/features/sage_feature.py +++ /dev/null @@ -1,38 +0,0 @@ -import click -import pandas as pd -import pyopenms as oms - - -@click.command("sage2feature") -@click.option("--idx_file", "-i", help="Input idXML file") -@click.option("--output_file", "-o", help="Output idXML file") -@click.option("--feat_file", "-f", help="Input feature table file") -@click.pass_context -def add_sage_feature(ctx, idx_file: str, output_file: str, feat_file: str): - """ - Add extra features in features idXML. Adding extra feature in Sage isn't known input for PSMFeatureExtractor - - :param ctx: click context - :param idx_file: Original idXML file - :param output_file: Outpuf file with the extra feature - :param feat_file: Feature file from Sage - :return: None - """ - extra_feat = [] - feat = pd.read_csv(feat_file, sep="\t") - for _, row in feat.iterrows(): - if row["feature_generator"] == "psm_file": - continue - else: - extra_feat.append(row["feature_name"]) - print("Adding extra feature: {}".format(extra_feat)) - protein_ids = [] - peptide_ids = [] - oms.IdXMLFile().load(idx_file, protein_ids, peptide_ids) - search_parameters = protein_ids[0].getSearchParameters() - features = search_parameters.getMetaValue("extra_features") - extra_features = features + "," + ",".join(extra_feat) - search_parameters.setMetaValue("extra_features", extra_features) - protein_ids[0].setSearchParameters(search_parameters) - oms.IdXMLFile().store(output_file, protein_ids, peptide_ids) - print("Done") diff --git a/quantmsutils/features/snr.py b/quantmsutils/features/snr.py deleted file mode 100644 index c0787b1..0000000 --- a/quantmsutils/features/snr.py +++ /dev/null @@ -1,113 +0,0 @@ -import re - -import click -import numpy as np -from pyopenms import MzMLFile, MSExperiment, SpectrumLookup -import pyopenms as oms -from scipy.stats import entropy - - -def compute_signal_to_noise(intensities): - """ - Compute the signal-to-noise ratio for a given spectrum - :param intensities: intensity values - :return: - """ - rmsd = np.sqrt(np.mean(np.square(intensities))) - - # Calculate SNR - snr = np.max(intensities) / rmsd - - return snr - - -@click.command("spectrum2feature") -@click.option("--ms_path", type=click.Path(exists=True), required=True) -@click.option( - "--idxml", - help="The idxml file with the PSMs corresponding to the mzML file", - required=True, -) -@click.option( - "--output", - help="The output idXML file with the signal-to-noise ratio", - required=True, -) -@click.pass_context -def spectrum2feature(ctx, ms_path: str, idxml: str, output: str) -> None: - """ - The snr function computes the signal-to-noise ratio for a given mass spectrometry file and its corresponding - identification file. - - :param ctx: Click context - :param ms_path: A string specifying the path to the mass spectrometry file. - :param idxml: A string specifying the path to the identification file. - :param output: A string specifying the path to the output file. - """ - - mzml_file = MSExperiment() - MzMLFile().load(ms_path, mzml_file) - - lookup = SpectrumLookup() - lookup.readSpectra(mzml_file, "scan=(?\\d+)") - - protein_ids = [] - peptide_ids = [] - oms.IdXMLFile().load(idxml, protein_ids, peptide_ids) - - result_peptides = [] - for peptide in peptide_ids: - spectrum_reference = peptide.getMetaValue("spectrum_reference") - scan_number = int(re.findall(r"(spectrum|scan)=(\d+)", spectrum_reference)[0][1]) - - try: - index = lookup.findByScanNumber(scan_number) - spectrum = mzml_file.getSpectrum(index) - intensity_array = spectrum.get_peaks()[1] - mz_array = spectrum.get_peaks()[0] - - ## Compute signal to noise ratio - snr = compute_signal_to_noise(intensity_array) - - # Spectral Entropy - tic = np.sum(intensity_array) - normalized_intensities = intensity_array / tic - spectral_entropy = entropy(normalized_intensities) - - # Fraction of TIC in Top 10 Peaks - top_n_peaks = np.sort(intensity_array)[-10:] - fraction_tic_top_10 = np.sum(top_n_peaks) / tic - - # Intensity Weighted m/z Standard Deviation - weighted_mean_mz = np.sum(np.array(mz_array) * normalized_intensities) - weighted_std_mz = np.sqrt( - np.sum(normalized_intensities * (np.array(mz_array) - weighted_mean_mz) ** 2) - ) - - for hit in peptide.getHits(): - hit.setMetaValue("quantms:SNR", str(snr)) - hit.setMetaValue("quantms:SpectralEntropy", str(spectral_entropy)) - hit.setMetaValue("quantms:FracTICinTop10Peaks", str(fraction_tic_top_10)) - hit.setMetaValue("quantms:WeightedStdMz", str(weighted_std_mz)) - - # update hit in peptidehits list - peptide.setHits([hit]) - ## update peptide in peptide_ids list - result_peptides.append(peptide) - - except IndexError: - message = "scan_number" + str(scan_number) + "not found in file: " + ms_path - print(message) - - # Add quantms:SNR as a feature - search_parameters = protein_ids[0].getSearchParameters() - features = search_parameters.getMetaValue("extra_features") - extra_features = ( - features - + ",quantms:SNR,quantms:SpectralEntropy,quantms:FracTICinTop10Peaks,quantms:WeightedStdMz" - ) - search_parameters.setMetaValue("extra_features", extra_features) - protein_ids[0].setSearchParameters(search_parameters) - - oms.IdXMLFile().store(output, protein_ids, peptide_ids) - print("The output file with the signal-to-noise ratio has been saved at: ", output) From e85331c8f6f781b9399b2df60ba45a69c89566a1 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Thu, 20 Feb 2025 14:56:38 +0000 Subject: [PATCH 6/9] REmove the feature commands -> quantms-rescoring --- README.md | 5 ---- quantmsutils/quantmsutilsc.py | 4 ---- tests/test_commands.py | 44 ----------------------------------- 3 files changed, 53 deletions(-) diff --git a/README.md b/README.md index d9faa94..4f848e9 100644 --- a/README.md +++ b/README.md @@ -27,11 +27,6 @@ The following functionalities are available in the package: - `openms2sample` - Extra sample information from OpenMS experimental design file. An example of OpenMS experimental design file is available [here](https://github.com/bigbio/quantms-utils/blob/dev/tests/test_data/BSA_design_urls.tsv). - `checksamplesheet` - Check the sample sheet for errors and inconsistencies. The experimental design coult be an OpenMS experimental design file or and SDRF file. -### Features to percolator scripts - -- `sage2feature` - The add_sage_feature function enhances an idXML file by appending additional features from a Sage feature table, excluding those generated by 'psm_file'. -- `spectrum2feature` - Add the signal-to-noise ratio (SNR) to the feature table for percolator. - ### Other scripts - `psmconvert` - The convert_psm function converts peptide spectrum matches (PSMs) from an idXML file to a CSV file, optionally filtering out decoy matches. It extracts and processes data from both the idXML and an associated spectra file, handling multiple search engines and scoring systems. diff --git a/quantmsutils/quantmsutilsc.py b/quantmsutils/quantmsutilsc.py index d258a2e..815180f 100644 --- a/quantmsutils/quantmsutilsc.py +++ b/quantmsutils/quantmsutilsc.py @@ -2,12 +2,10 @@ from quantmsutils.diann.diann2mztab import diann2mztab from quantmsutils.diann.dianncfg import dianncfg -from quantmsutils.features.sage_feature import add_sage_feature from quantmsutils.mzml.mzml_statistics import mzml_statistics from quantmsutils.psm.psm_conversion import convert_psm from quantmsutils.sdrf.check_samplesheet import checksamplesheet from quantmsutils.sdrf.extract_sample import extract_sample_from_expdesign -from quantmsutils.features.snr import spectrum2feature from quantmsutils import __version__ CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"]) @@ -23,12 +21,10 @@ def cli(): cli.add_command(dianncfg) cli.add_command(diann2mztab) -cli.add_command(add_sage_feature) cli.add_command(mzml_statistics) cli.add_command(extract_sample_from_expdesign) cli.add_command(checksamplesheet) cli.add_command(convert_psm) -cli.add_command(spectrum2feature) def main(): diff --git a/tests/test_commands.py b/tests/test_commands.py index d49032f..b648fc3 100644 --- a/tests/test_commands.py +++ b/tests/test_commands.py @@ -11,14 +11,6 @@ def test_create_diann_cfg_help(): assert result.exit_code == 0 -# test for the add_sage_feature command in cli -def test_add_sage_feature_help(): - runner = CliRunner() - result = runner.invoke(cli, ["sage2feature", "--help"]) - - assert result.exit_code == 0 - - # test for the mzml_statistics command in cli def test_mzml_statistics_help(): runner = CliRunner() @@ -43,42 +35,6 @@ def test_extract_sample_from_expdesign_help(): assert result.exit_code == 0 -def test_sage_feature_file(): - runner = CliRunner() - result = runner.invoke( - cli, - [ - "sage2feature", - "--idx_file", - "tests/test_data/TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01_sage_ms2rescore.idXML", - "--output_file", - "tests/test_data/TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01_sage_ms2rescore_feat_gen.idXML", - "--feat_file", - "tests/test_data/tmt_erwinia_1ulsike_top10hcd_isol2_45stepped_60min_01_sage_ms2rescore.idxml.feature_names.tsv", - ], - ) - - assert result.exit_code == 0 - - -def test_spectrum2fature_file(): - runner = CliRunner() - result = runner.invoke( - cli, - [ - "spectrum2feature", - "--ms_path", - "tests/test_data/TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01.mzML", - "--idxml", - "tests/test_data/TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01_sage_ms2rescore.idXML", - "--output", - "tests/test_data/TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01_sage_ms2rescore_snr.idXML", - ], - ) - - assert result.exit_code == 0 - - def test_diann2mztab_example(): runner = CliRunner() result = runner.invoke( From 04d2af6e752c831138e950e88b2f70f087a19de7 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Thu, 20 Feb 2025 14:57:48 +0000 Subject: [PATCH 7/9] REmove the feature commands -> quantms-rescoring --- recipe/meta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipe/meta.yaml b/recipe/meta.yaml index f6c4210..bc30f7b 100644 --- a/recipe/meta.yaml +++ b/recipe/meta.yaml @@ -1,7 +1,7 @@ # recipe/meta.yaml package: name: quantms-utils - version: "0.0.18" + version: "0.0.19" source: path: ../ From 09273b91bfef19f706a1e1fba0e74b032c7cf94e Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Thu, 20 Feb 2025 16:12:41 +0000 Subject: [PATCH 8/9] REmove the feature commands -> quantms-rescoring --- quantmsutils/diann/diann2mztab.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/quantmsutils/diann/diann2mztab.py b/quantmsutils/diann/diann2mztab.py index a130eb2..534a993 100644 --- a/quantmsutils/diann/diann2mztab.py +++ b/quantmsutils/diann/diann2mztab.py @@ -77,6 +77,7 @@ def diann2mztab( :type missed_cleavages: int :param qvalue_threshold: Threshold for filtering q value :type qvalue_threshold: float + :param enable_diann2mztab: Enable conversion to mzTab """ logger.debug(f"Revision {REVISION}") logger.debug("Reading input files...") @@ -714,11 +715,6 @@ def mztab_prh(report, pg, index_ref, database, fasta_df, diann_version): f" input index_ref shape: {index_ref.shape}," f" input fasta_df shape: {fasta_df.shape}" ) - # DIA-NN 2.0 PG file doesn't have Protein IDs column - if "2.0" not in diann_version: - file = list(pg.columns[5:]) - else: - file = list(pg.columns[4:]) logger.debug("Classifying results type ...") pg["opt_global_result_type"] = "single_protein" From f72c96f61583b39364a95c70a1c4a1bf5fb04096 Mon Sep 17 00:00:00 2001 From: Yasset Perez-Riverol Date: Thu, 20 Feb 2025 16:21:24 +0000 Subject: [PATCH 9/9] commit unnused variable --- quantmsutils/diann/diann2mztab.py | 1 - 1 file changed, 1 deletion(-) diff --git a/quantmsutils/diann/diann2mztab.py b/quantmsutils/diann/diann2mztab.py index 534a993..290f254 100644 --- a/quantmsutils/diann/diann2mztab.py +++ b/quantmsutils/diann/diann2mztab.py @@ -902,7 +902,6 @@ def mztab_peh( f" len(precursor_list): {len(precursor_list)}," f" index_ref.shape: {index_ref.shape}" ) - out_mztab_peh = pd.DataFrame() out_mztab_peh = pr.iloc[:, 0:10] out_mztab_peh.drop( ["Protein.Ids", "Protein.Names", "First.Protein.Description", "Proteotypic"],