From e43fecf455eb936976a306f7cab52c8620594efa Mon Sep 17 00:00:00 2001
From: Chengxin Dai <37200167+daichengxin@users.noreply.github.com>
Date: Tue, 18 Feb 2025 15:52:50 +0800
Subject: [PATCH 1/9] Support for DIA-NN 2.0

---
 pyproject.toml                    |   2 +-
 quantmsutils/__init__.py          |   2 +-
 quantmsutils/diann/diann2mztab.py | 136 ++++++++++++++++--------------
 3 files changed, 77 insertions(+), 63 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 36452be..4d6d364 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -3,7 +3,7 @@ name = "quantms-utils"
 description = "Python scripts and helpers for the quantMS workflow"
 readme = "README.md"
 license = "MIT"
-version = "0.0.17"
+version = "0.0.19"
 authors = [
     "Yasset Perez-Riverol <ypriverol@gmail.com>",
     "Dai Chengxin <chengxin2024@126.com>",
diff --git a/quantmsutils/__init__.py b/quantmsutils/__init__.py
index f18e5d0..a11f0b4 100644
--- a/quantmsutils/__init__.py
+++ b/quantmsutils/__init__.py
@@ -1 +1 @@
-__version__ = "0.0.18"
+__version__ = "0.0.19"
diff --git a/quantmsutils/diann/diann2mztab.py b/quantmsutils/diann/diann2mztab.py
index a405737..cdb535c 100644
--- a/quantmsutils/diann/diann2mztab.py
+++ b/quantmsutils/diann/diann2mztab.py
@@ -90,7 +90,6 @@ def diann2mztab(
         "Modified.Sequence",
         "Precursor.Charge",
         "Precursor.Quantity",
-        "File.Name",
         "Run",
     ]
 
@@ -101,7 +100,6 @@ def diann2mztab(
         "PeptideSequence",
         "PrecursorCharge",
         "Intensity",
-        "Reference",
         "Run",
     ]
     out_msstats = out_msstats[out_msstats["Intensity"] != 0]
@@ -113,12 +111,6 @@ def diann2mztab(
     out_msstats["FragmentIon"] = "NA"
     out_msstats["ProductCharge"] = "0"
     out_msstats["IsotopeLabelType"] = "L"
-    unique_reference_map = {k: os.path.basename(k) for k in out_msstats["Reference"].unique()}
-    out_msstats["Reference"] = out_msstats["Reference"].map(unique_reference_map)
-    del unique_reference_map
-
-    logger.debug("\n\nReference Column >>>")
-    logger.debug(out_msstats["Reference"])
 
     logger.debug(f"\n\nout_msstats ({out_msstats.shape}) >>>")
     logger.debug(out_msstats.head(5))
@@ -325,7 +317,11 @@ def find_first_file_with_suffix(self, suffix: str) -> os.PathLike:
 
     @property
     def report(self) -> os.PathLike:
-        return self.find_first_file_with_suffix("report.tsv")
+        # DIA-NN 1.8.1 return tsv format, but DIA-NN 2.0 only return parquet
+        try:
+            return self.find_first_file_with_suffix("report.tsv")
+        except FileNotFoundError as e:
+            return self.find_first_file_with_suffix("report.parquet")
 
     @property
     def pg_matrix(self) -> os.PathLike:
@@ -362,7 +358,7 @@ def diann_version(self) -> str:
         return diann_version_id
 
     def validate_diann_version(self) -> None:
-        supported_diann_versions = ["1.8.1", "1.9.beta.1", "1.9.2"]
+        supported_diann_versions = ["1.8.1", "2.0", "2.0.1", "2.0.2"]
         if self.diann_version not in supported_diann_versions:
             raise ValueError(f"Unsupported DIANN version {self.diann_version}")
 
@@ -443,7 +439,6 @@ def convert_to_mztab(
 
     def main_report_df(self, qvalue_threshold: float) -> pd.DataFrame:
         remain_cols = [
-            "File.Name",
             "Run",
             "Protein.Group",
             "Protein.Names",
@@ -465,7 +460,17 @@ def main_report_df(self, qvalue_threshold: float) -> pd.DataFrame:
             "Global.PG.Q.Value",
             "MS2.Scan",
         ]
-        report = pd.read_csv(self.report, sep="\t", header=0, usecols=remain_cols)
+        if ".tsv" in self.report.suffix:
+            report = pd.read_csv(self.report, sep="\t", header=0, usecols=remain_cols)
+        else:
+            report = pd.read_parquet(self.report)
+        # filter decoy
+        if "Decoy" in report.columns:
+            logger.debug(
+                f"Filtering decoy from report: {len(report)} rows"
+            )
+            report = report[report["Decoy"] == 0]
+            logger.debug(f"Report filtered, {len(report)} rows remaining")
 
         # filter based on qvalue parameter for downstream analysiss
         logger.debug(
@@ -1087,60 +1092,69 @@ def __find_info(directory, n):
             ].astype(str)
 
         # TODO seconds returned from precursor.getRT()
-        target.loc[:, "RT"] = target.apply(lambda x: x["RT"] / 60, axis=1)
+        # RT column of DIA-NN 2.0.* is float32 type, but RT column of DIA-NN 1.8.* is float64 type
+        target.loc[:, "RT"] = target.apply(lambda x: (x["RT"] / 60), axis=1)
+        group['RT'] = group['RT'].astype('float64')
 
         rt_matched = pd.merge_asof(group, target, on="RT", direction="nearest")
-        new_target = target
-        new_target.columns = [
-            "scan_RT",
-            "scan_opt_global_spectrum_reference",
-            "MS2.Scan",
-            "scan_exp_mass_to_charge",
-        ]
-        scan_matched = pd.merge(rt_matched, new_target, on="MS2.Scan")
+        # DIA-NN can't export MS2.Scan column in some versions (eg. DIA-NN 2.0, DIA-NN 2.0.1).
+        # So we only match MS2 by RT.
+        if "MS2.Scan" in group.columns:
+            logger.info("Mapping DIA-NN MS2 to mzML by MS2.Scan and RT columns")
+            new_target = target
+            new_target.columns = [
+                "scan_RT",
+                "scan_opt_global_spectrum_reference",
+                "MS2.Scan",
+                "scan_exp_mass_to_charge",
+            ]
+            scan_matched = pd.merge(rt_matched, new_target, on="MS2.Scan")
 
-        #  Cross validation spectrum ID between scan matched and RT matched
-        # Keep Scan matched When RT matched and DIA-NN Scan matched are inconsistent in mzML.
-        scan_matched["unassigned_matched"] = scan_matched.apply(
-            lambda row: 1 if row["MS2.Scan"] != row["DIANN-intraID"] else 0, axis=1
-        )
-        if len(scan_matched[scan_matched["unassigned_matched"] == 1]) > 0:
-            v_str = scan_matched[scan_matched["unassigned_matched"] == 1]["MS2.Scan"].tolist()
-            logger.info(
-                f"RT matched and DIA-NN Scan matched are inconsistent in mzML. Keep Scan matched: {v_str}"
-            )
-            scan_matched.drop(
-                [
-                    "RT",
-                    "opt_global_spectrum_reference",
-                    "DIANN-intraID",
-                    "exp_mass_to_charge",
-                    "unassigned_matched",
-                ],
-                inplace=True,
-                axis=1,
-            )
-            scan_matched.rename(
-                columns={
-                    "scan_RT": "RT",
-                    "scan_opt_global_spectrum_reference": "opt_global_spectrum_reference",
-                    "scan_exp_mass_to_charge": "exp_mass_to_charge",
-                },
-                inplace=True,
+            #  Cross validation spectrum ID between scan matched and RT matched
+            # Keep Scan matched When RT matched and DIA-NN Scan matched are inconsistent in mzML.
+            scan_matched["unassigned_matched"] = scan_matched.apply(
+                lambda row: 1 if row["MS2.Scan"] != row["DIANN-intraID"] else 0, axis=1
             )
+            if len(scan_matched[scan_matched["unassigned_matched"] == 1]) > 0:
+                v_str = scan_matched[scan_matched["unassigned_matched"] == 1]["MS2.Scan"].tolist()
+                logger.info(
+                    f"RT matched and DIA-NN Scan matched are inconsistent in mzML. Keep Scan matched: {v_str}"
+                )
+                scan_matched.drop(
+                    [
+                        "RT",
+                        "opt_global_spectrum_reference",
+                        "DIANN-intraID",
+                        "exp_mass_to_charge",
+                        "unassigned_matched",
+                    ],
+                    inplace=True,
+                    axis=1,
+                )
+                scan_matched.rename(
+                    columns={
+                        "scan_RT": "RT",
+                        "scan_opt_global_spectrum_reference": "opt_global_spectrum_reference",
+                        "scan_exp_mass_to_charge": "exp_mass_to_charge",
+                    },
+                    inplace=True,
+                )
+            else:
+                scan_matched.drop(
+                    [
+                        "scan_RT",
+                        "scan_opt_global_spectrum_reference",
+                        "scan_exp_mass_to_charge",
+                        "unassigned_matched",
+                    ],
+                    inplace=True,
+                    axis=1,
+                )
+
+            out_mztab_psh = pd.concat([out_mztab_psh, scan_matched])
         else:
-            scan_matched.drop(
-                [
-                    "scan_RT",
-                    "scan_opt_global_spectrum_reference",
-                    "scan_exp_mass_to_charge",
-                    "unassigned_matched",
-                ],
-                inplace=True,
-                axis=1,
-            )
-
-        out_mztab_psh = pd.concat([out_mztab_psh, scan_matched])
+            logger.info("MS2.Scan column isn't in DIA-NN report, only Matching MS2 by RT values")
+            out_mztab_psh = pd.concat([out_mztab_psh, rt_matched])
 
     del report
 

From 12dae7270450a96905c7087b7b3d24cd9e843641 Mon Sep 17 00:00:00 2001
From: Chengxin Dai <37200167+daichengxin@users.noreply.github.com>
Date: Tue, 18 Feb 2025 16:04:59 +0800
Subject: [PATCH 2/9] Update diann2mztab.py

---
 quantmsutils/diann/diann2mztab.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/quantmsutils/diann/diann2mztab.py b/quantmsutils/diann/diann2mztab.py
index cdb535c..75e995f 100644
--- a/quantmsutils/diann/diann2mztab.py
+++ b/quantmsutils/diann/diann2mztab.py
@@ -320,7 +320,7 @@ def report(self) -> os.PathLike:
         # DIA-NN 1.8.1 return tsv format, but DIA-NN 2.0 only return parquet
         try:
             return self.find_first_file_with_suffix("report.tsv")
-        except FileNotFoundError as e:
+        except FileNotFoundError:
             return self.find_first_file_with_suffix("report.parquet")
 
     @property

From f0156c979be77200e509938cd7ef5f3388c6f606 Mon Sep 17 00:00:00 2001
From: Chengxin Dai <37200167+daichengxin@users.noreply.github.com>
Date: Wed, 19 Feb 2025 15:03:27 +0800
Subject: [PATCH 3/9] Update diann2mztab.py

---
 quantmsutils/diann/diann2mztab.py | 112 ++++++++++++++++++------------
 1 file changed, 66 insertions(+), 46 deletions(-)

diff --git a/quantmsutils/diann/diann2mztab.py b/quantmsutils/diann/diann2mztab.py
index 75e995f..bf1f180 100644
--- a/quantmsutils/diann/diann2mztab.py
+++ b/quantmsutils/diann/diann2mztab.py
@@ -414,7 +414,7 @@ def convert_to_mztab(
             sep="\t",
             header=0,
         )
-        prh = mztab_prh(report, pg, index_ref, database, fasta_df)
+        prh = mztab_prh(report, pg, index_ref, database, fasta_df, self.diann_version)
         del pg
         pr = pd.read_csv(
             self.pr_matrix,
@@ -443,10 +443,8 @@ def main_report_df(self, qvalue_threshold: float) -> pd.DataFrame:
             "Protein.Group",
             "Protein.Names",
             "Protein.Ids",
-            "First.Protein.Description",
             "PG.MaxLFQ",
             "RT",
-            "MS2.Scan",
             "Global.Q.Value",
             "Lib.Q.Value",
             "PEP",
@@ -457,20 +455,13 @@ def main_report_df(self, qvalue_threshold: float) -> pd.DataFrame:
             "Stripped.Sequence",
             "Precursor.Charge",
             "Precursor.Quantity",
-            "Global.PG.Q.Value",
-            "MS2.Scan",
+            "Global.PG.Q.Value"
         ]
-        if ".tsv" in self.report.suffix:
-            report = pd.read_csv(self.report, sep="\t", header=0, usecols=remain_cols)
+        if "2.0" not in self.diann_version:
+            report = pd.read_csv(self.report, sep="\t", header=0,
+                                 usecols=remain_cols + ["MS2.Scan"])
         else:
-            report = pd.read_parquet(self.report)
-        # filter decoy
-        if "Decoy" in report.columns:
-            logger.debug(
-                f"Filtering decoy from report: {len(report)} rows"
-            )
-            report = report[report["Decoy"] == 0]
-            logger.debug(f"Report filtered, {len(report)} rows remaining")
+            report = pd.read_parquet(self.report, columns=remain_cols + ["Decoy"])
 
         # filter based on qvalue parameter for downstream analysiss
         logger.debug(
@@ -680,7 +671,7 @@ def mztab_mtd(index_ref, dia_params, fasta, charge, missed_cleavages, diann_vers
     return out_mztab_mtd_t, database
 
 
-def mztab_prh(report, pg, index_ref, database, fasta_df):
+def mztab_prh(report, pg, index_ref, database, fasta_df, diann_version):
     """
     Construct PRH sub-table.
 
@@ -694,6 +685,8 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):
     :type database: str
     :param fasta_df: A dataframe contains protein IDs, sequences and lengths
     :type fasta_df: pandas.core.frame.DataFrame
+    :param diann_version: DIA-NN version
+    :type diann_version: str
     :return: PRH sub-table
     :rtype: pandas.core.frame.DataFrame
     """
@@ -704,7 +697,11 @@ def mztab_prh(report, pg, index_ref, database, fasta_df):
         f" input index_ref shape: {index_ref.shape},"
         f" input fasta_df shape: {fasta_df.shape}"
     )
-    file = list(pg.columns[5:])
+    #  DIA-NN 2.0 PG file doesn't have Protein IDs column
+    if "2.0" not in diann_version:
+        file = list(pg.columns[5:])
+    else:
+        file = list(pg.columns[4:])
     col = {}
     for i in file:
         col[i] = (
@@ -966,6 +963,7 @@ def mztab_peh(
     }
     name_mapper = name_mapper_builder(subname_mapper)
     tmp.rename(columns=name_mapper, inplace=True)
+    print(tmp.columns)
     out_mztab_peh = out_mztab_peh.merge(
         tmp.rename(columns={"precursor.Index": "pr_id"}),
         on="pr_id",
@@ -1159,40 +1157,62 @@ def __find_info(directory, n):
     del report
 
     # Score at PSM level: Q.Value
-    out_mztab_psh = out_mztab_psh[
-        [
-            "Stripped.Sequence",
-            "Protein.Group",
-            "Q.Value",
-            "RT",
-            "Precursor.Charge",
-            "Calculate.Precursor.Mz",
-            "exp_mass_to_charge",
-            "Modified.Sequence",
-            "PEP",
-            "Global.Q.Value",
-            "Global.Q.Value",
-            "opt_global_spectrum_reference",
-            "ms_run",
-        ]
-    ]
-    out_mztab_psh.columns = [
-        "sequence",
-        "accession",
-        "search_engine_score[1]",
-        "retention_time",
-        "charge",
-        "calc_mass_to_charge",
+    psm_columns = [
+        "Stripped.Sequence",
+        "Protein.Group",
+        "Q.Value",
+        "RT",
+        "Precursor.Charge",
+        "Calculate.Precursor.Mz",
         "exp_mass_to_charge",
-        "opt_global_cv_MS:1000889_peptidoform_sequence",
-        "opt_global_SpecEValue_score",
-        "opt_global_q-value",
-        "opt_global_q-value_score",
+        "Modified.Sequence",
+        "PEP",
+        "Global.Q.Value",
+        "Global.Q.Value",
         "opt_global_spectrum_reference",
         "ms_run",
     ]
 
-    out_mztab_psh.loc[:, "opt_global_cv_MS:1002217_decoy_peptide"] = "0"
+    if "Decoy" in out_mztab_psh.columns:
+        out_mztab_psh = out_mztab_psh[
+            psm_columns + ["Decoy"]
+        ]
+        out_mztab_psh.columns = [
+            "sequence",
+            "accession",
+            "search_engine_score[1]",
+            "retention_time",
+            "charge",
+            "calc_mass_to_charge",
+            "exp_mass_to_charge",
+            "opt_global_cv_MS:1000889_peptidoform_sequence",
+            "opt_global_SpecEValue_score",
+            "opt_global_q-value",
+            "opt_global_q-value_score",
+            "opt_global_spectrum_reference",
+            "ms_run",
+            "opt_global_cv_MS:1002217_decoy_peptide"
+        ]
+    else:
+        out_mztab_psh = out_mztab_psh[
+            psm_columns
+        ]
+        out_mztab_psh.columns = [
+            "sequence",
+            "accession",
+            "search_engine_score[1]",
+            "retention_time",
+            "charge",
+            "calc_mass_to_charge",
+            "exp_mass_to_charge",
+            "opt_global_cv_MS:1000889_peptidoform_sequence",
+            "opt_global_SpecEValue_score",
+            "opt_global_q-value",
+            "opt_global_q-value_score",
+            "opt_global_spectrum_reference",
+            "ms_run"
+        ]
+        out_mztab_psh.loc[:, "opt_global_cv_MS:1002217_decoy_peptide"] = 0
     out_mztab_psh.loc[:, "PSM_ID"] = out_mztab_psh.index
     out_mztab_psh.loc[:, "unique"] = out_mztab_psh.apply(
         lambda x: "0" if ";" in str(x["accession"]) else "1",

From a8b3ed38f7aa2499195c9022f9415e83c3906051 Mon Sep 17 00:00:00 2001
From: Chengxin Dai <37200167+daichengxin@users.noreply.github.com>
Date: Thu, 20 Feb 2025 22:33:10 +0800
Subject: [PATCH 4/9] fixed error and clean codes

---
 quantmsutils/diann/diann2mztab.py | 302 +++++++++++++-----------------
 1 file changed, 129 insertions(+), 173 deletions(-)

diff --git a/quantmsutils/diann/diann2mztab.py b/quantmsutils/diann/diann2mztab.py
index bf1f180..a130eb2 100644
--- a/quantmsutils/diann/diann2mztab.py
+++ b/quantmsutils/diann/diann2mztab.py
@@ -48,15 +48,15 @@
 @click.option("--enable_diann2mztab", "-e", is_flag=True)
 @click.pass_context
 def diann2mztab(
-    ctx,
-    folder,
-    exp_design,
-    dia_params,
-    diann_version,
-    charge,
-    missed_cleavages,
-    qvalue_threshold,
-    enable_diann2mztab
+        ctx,
+        folder,
+        exp_design,
+        dia_params,
+        diann_version,
+        charge,
+        missed_cleavages,
+        qvalue_threshold,
+        enable_diann2mztab
 ):
     """
     Convert DIA-NN output to MSstats, Triqler or mzTab.
@@ -94,7 +94,12 @@ def diann2mztab(
     ]
 
     logger.debug("Converting to MSstats format...")
-    out_msstats = report[msstats_columns_keep]
+    # Filter Decoy precursors for MSstats analysis, the decoy PSMs are only store in mzTab
+    if "Decoy" in report.columns:
+        out_msstats = report[report["Decoy"] != 1]
+        out_msstats = out_msstats[msstats_columns_keep]
+    else:
+        out_msstats = report[msstats_columns_keep]
     out_msstats.columns = [
         "ProteinName",
         "PeptideSequence",
@@ -104,9 +109,11 @@ def diann2mztab(
     ]
     out_msstats = out_msstats[out_msstats["Intensity"] != 0]
 
-    # Q: What is this line doing?
+    # Convert Unimod ID to modification name and replace ^ with DECOY_
     out_msstats.loc[:, "PeptideSequence"] = out_msstats.apply(
-        lambda x: AASequence.fromString(x["PeptideSequence"]).toString(), axis=1
+        lambda x: AASequence.fromString(x["PeptideSequence"]).toString() if "^" not in x["PeptideSequence"] else
+        "^" + AASequence.fromString(x["PeptideSequence"].replace("^", "")).toString(),
+        axis=1
     )
     out_msstats["FragmentIon"] = "NA"
     out_msstats["ProductCharge"] = "0"
@@ -222,7 +229,7 @@ def get_exp_design_dfs(exp_design_file):
         f_table = pd.DataFrame(f_table, columns=f_header)
         f_table.loc[:, "run"] = f_table.apply(lambda x: _true_stem(x["Spectra_Filepath"]), axis=1)
 
-        s_table = [i.replace("\n", "").split("\t") for i in data[empty_row + 1 :]][1:]
+        s_table = [i.replace("\n", "").split("\t") for i in data[empty_row + 1:]][1:]
         s_header = data[empty_row + 1].replace("\n", "").split("\t")
         s_data_frame = pd.DataFrame(s_table, columns=s_header)
 
@@ -250,6 +257,9 @@ def compute_mass_modified_peptide(peptide_seq: str) -> float:
         "O": "X[237.14773053312]",  # 255.158295 - 17.003288 - 1.00727646688
     }
     for aa in peptide_seq:
+        # DIA-NN Decoy tag
+        if aa == "^":
+            continue
         # Check if the letter is in aminoacid
         if aa == "(":
             not_mod = False
@@ -259,35 +269,36 @@ def compute_mass_modified_peptide(peptide_seq: str) -> float:
         if aa in aa_mass and not_mod:
             aa = aa_mass[aa]
         elif (
-            aa
-            not in [
-                "G",
-                "A",
-                "V",
-                "L",
-                "I",
-                "F",
-                "M",
-                "P",
-                "W",
-                "S",
-                "C",
-                "T",
-                "Y",
-                "N",
-                "Q",
-                "D",
-                "E",
-                "K",
-                "R",
-                "H",
-            ]
-            and not_mod
-            and aa != ")"
+                aa
+                not in [
+                    "G",
+                    "A",
+                    "V",
+                    "L",
+                    "I",
+                    "F",
+                    "M",
+                    "P",
+                    "W",
+                    "S",
+                    "C",
+                    "T",
+                    "Y",
+                    "N",
+                    "Q",
+                    "D",
+                    "E",
+                    "K",
+                    "R",
+                    "H",
+                ]
+                and not_mod
+                and aa != ")"
         ):
             logger.info(f"Unknown amino acid with mass not known:{aa}")
         peptide_parts.append(aa)
     new_peptide_seq = "".join(peptide_parts)
+    # DIA-NN maybe output "^", see https://github.com/vdemichev/DiaNN/issues/1404
     mass = AASequence.fromString(new_peptide_seq).getMonoWeight()
     logger.debug(new_peptide_seq + ":" + str(mass))
     return mass
@@ -363,13 +374,13 @@ def validate_diann_version(self) -> None:
             raise ValueError(f"Unsupported DIANN version {self.diann_version}")
 
     def convert_to_mztab(
-        self,
-        report,
-        f_table,
-        charge: int,
-        missed_cleavages: int,
-        dia_params: List[Any],
-        out: Union[os.PathLike, str],
+            self,
+            report,
+            f_table,
+            charge: int,
+            missed_cleavages: int,
+            dia_params: List[Any],
+            out: Union[os.PathLike, str],
     ) -> None:
         logger.info("Converting to mzTab")
         self.validate_diann_version()
@@ -392,21 +403,21 @@ def convert_to_mztab(
         index_ref = f_table.copy()
         index_ref.rename(
             columns={
-                "Fraction_Group": "ms_run",
+                "Fraction_Group": "assay",
                 "Sample": "study_variable",
                 "run": "Run",
             },
             inplace=True,
         )
-        index_ref["ms_run"] = index_ref["ms_run"].astype("int")
+        index_ref["assay"] = index_ref["assay"].astype("int")
         index_ref["study_variable"] = index_ref["study_variable"].astype("int")
         report = report.merge(
-            index_ref[["ms_run", "Run", "study_variable"]],
+            index_ref[["assay", "Run", "study_variable"]],
             on="Run",
             validate="many_to_one",
         )
 
-        mtd, database = mztab_mtd(
+        mtd, database, ms_run_index = mztab_mtd(
             index_ref, dia_params, str(self.fasta), charge, missed_cleavages, self.diann_version
         )
         pg = pd.read_csv(
@@ -422,9 +433,9 @@ def convert_to_mztab(
             header=0,
         )
         precursor_list = list(report["Precursor.Id"].unique())
-        peh = mztab_peh(report, pr, precursor_list, index_ref, database)
+        peh = mztab_peh(report, pr, precursor_list, index_ref, database, ms_run_index)
         del pr
-        psh = mztab_psh(report, str(self.base_path), database)
+        psh = mztab_psh(report, str(self.base_path), database, ms_run_index)
         del report
         mtd.loc["", :] = ""
         prh.loc[len(prh) + 1, :] = ""
@@ -478,8 +489,8 @@ def main_report_df(self, qvalue_threshold: float) -> pd.DataFrame:
         }
         mass_vector = report["Modified.Sequence"].map(uniq_masses)
         report["Calculate.Precursor.Mz"] = (
-            mass_vector + (PROTON_MASS_U * report["Precursor.Charge"])
-        ) / report["Precursor.Charge"]
+                                                   mass_vector + (PROTON_MASS_U * report["Precursor.Charge"])
+                                           ) / report["Precursor.Charge"]
 
         logger.debug("Indexing Precursors")
         # Making the map is 1500x faster
@@ -580,16 +591,16 @@ def mztab_mtd(index_ref, dia_params, fasta, charge, missed_cleavages, diann_vers
     out_mztab_mtd.loc[1, "software[1]-setting[1]"] = fasta
     out_mztab_mtd.loc[1, "software[1]-setting[2]"] = "db_version:null"
     out_mztab_mtd.loc[1, "software[1]-setting[3]"] = (
-        "fragment_mass_tolerance:" + fragment_mass_tolerance
+            "fragment_mass_tolerance:" + fragment_mass_tolerance
     )
     out_mztab_mtd.loc[1, "software[1]-setting[4]"] = (
-        "fragment_mass_tolerance_unit:" + fragment_mass_tolerance_unit
+            "fragment_mass_tolerance_unit:" + fragment_mass_tolerance_unit
     )
     out_mztab_mtd.loc[1, "software[1]-setting[5]"] = (
-        "precursor_mass_tolerance:" + precursor_mass_tolerance
+            "precursor_mass_tolerance:" + precursor_mass_tolerance
     )
     out_mztab_mtd.loc[1, "software[1]-setting[6]"] = (
-        "precursor_mass_tolerance_unit:" + precursor_mass_tolerance_unit
+            "precursor_mass_tolerance_unit:" + precursor_mass_tolerance_unit
     )
     out_mztab_mtd.loc[1, "software[1]-setting[7]"] = "enzyme:" + enzyme
     out_mztab_mtd.loc[1, "software[1]-setting[8]"] = "enzyme_term_specificity:full"
@@ -625,29 +636,35 @@ def mztab_mtd(index_ref, dia_params, fasta, charge, missed_cleavages, diann_vers
     out_mztab_mtd.loc[1, "protein-quantification_unit"] = "[, , Abundance, ]"
     out_mztab_mtd.loc[1, "peptide-quantification_unit"] = "[, , Abundance, ]"
 
-    for i in range(1, max(index_ref["ms_run"]) + 1):
+    # Construct ms run index
+    ms_run_index = {}
+    for i in range(1, len(index_ref["Run"]) + 1):
         out_mztab_mtd.loc[1, "ms_run[" + str(i) + "]-format"] = "[MS, MS:1000584, mzML file, ]"
         out_mztab_mtd.loc[1, "ms_run[" + str(i) + "]-location"] = (
-            "file://" + index_ref[index_ref["ms_run"] == i]["Spectra_Filepath"].values[0]
+                "file://" + index_ref.loc[i - 1, "Spectra_Filepath"]
         )
         out_mztab_mtd.loc[1, "ms_run[" + str(i) + "]-id_format"] = (
             "[MS, MS:1000777, spectrum identifier nativeID format, ]"
         )
+        ms_run_index[index_ref.loc[i - 1, "Run"]] = i
+
+    # Construct assay index by sample
+    for i in range(1, max(index_ref["study_variable"]) + 1):
+        assays_run = []
+        for j in list(index_ref[index_ref["study_variable"] == i].index):
+            assays_run.append("ms_run[" + str(j + 1) + "]")
+
         out_mztab_mtd.loc[1, "assay[" + str(i) + "]-quantification_reagent"] = (
             "[MS, MS:1002038, unlabeled sample, ]"
         )
-        out_mztab_mtd.loc[1, "assay[" + str(i) + "]-ms_run_ref"] = "ms_run[" + str(i) + "]"
+        out_mztab_mtd.loc[1, "assay[" + str(i) + "]-ms_run_ref"] = ",".join(assays_run)
 
+    # Construct assay index by sample
     with warnings.catch_warnings():
         warnings.simplefilter("ignore")
         # This is used here in order to ignore performance warnings from pandas.
         for i in range(1, max(index_ref["study_variable"]) + 1):
-            study_variable = []
-            for j in list(index_ref[index_ref["study_variable"] == i]["ms_run"].values):
-                study_variable.append("assay[" + str(j) + "]")
-            out_mztab_mtd.loc[1, "study_variable[" + str(i) + "]-assay_refs"] = ",".join(
-                study_variable
-            )
+            out_mztab_mtd.loc[1, "study_variable[" + str(i) + "]-assay_refs"] = "assay[" + str(i) + "]"
             out_mztab_mtd.loc[1, "study_variable[" + str(i) + "]-description"] = (
                 "no description given"
             )
@@ -668,7 +685,7 @@ def mztab_mtd(index_ref, dia_params, fasta, charge, missed_cleavages, diann_vers
     out_mztab_mtd_t.insert(0, "index", index)
     database = os.path.basename(fasta.split(".")[-2])
 
-    return out_mztab_mtd_t, database
+    return out_mztab_mtd_t, database, ms_run_index
 
 
 def mztab_prh(report, pg, index_ref, database, fasta_df, diann_version):
@@ -702,15 +719,6 @@ def mztab_prh(report, pg, index_ref, database, fasta_df, diann_version):
         file = list(pg.columns[5:])
     else:
         file = list(pg.columns[4:])
-    col = {}
-    for i in file:
-        col[i] = (
-            "protein_abundance_assay["
-            + str(index_ref[index_ref["Run"] == _true_stem(i)]["ms_run"].values[0])
-            + "]"
-        )
-
-    pg.rename(columns=col, inplace=True)
 
     logger.debug("Classifying results type ...")
     pg["opt_global_result_type"] = "single_protein"
@@ -748,7 +756,7 @@ def mztab_prh(report, pg, index_ref, database, fasta_df, diann_version):
 
     protein_details_df = out_mztab_prh[
         out_mztab_prh["opt_global_result_type"] == "indistinguishable_protein_group"
-    ]
+        ]
     prh_series = (
         protein_details_df["Protein.Group"]
         .str.split(";", expand=True)
@@ -765,12 +773,20 @@ def mztab_prh(report, pg, index_ref, database, fasta_df, diann_version):
     if len(protein_details_df) > 0:
         logger.info(f"Found {len(protein_details_df)} indistinguishable protein groups")
         # The Following line fails if there are no indistinguishable protein groups
-        protein_details_df.loc[:, "col"] = "protein_details"
+        protein_details_df.loc[:, "opt_global_result_type"] = "protein_details"
         # protein_details_df = protein_details_df[-protein_details_df["accession"].str.contains("-")]
-        out_mztab_prh = pd.concat([out_mztab_prh, protein_details_df]).reset_index(drop=True)
+        out_mztab_prh = pd.concat([out_mztab_prh, protein_details_df], ignore_index=True).reset_index(drop=True)
     else:
         logger.info("No indistinguishable protein groups found")
 
+    # Rename file name to protein abundance, and calculate mean of ms runs from same sample
+    for i in range(1, max(index_ref["study_variable"]) + 1):
+        assays_run = []
+        for j in list(index_ref[index_ref["study_variable"] == i].index):
+            assays_run.append(os.path.basename(index_ref.loc[j, "Spectra_Filepath"]))
+        out_mztab_prh["protein_abundance_assay[" + str(i) + "]"] = out_mztab_prh[assays_run].mean(axis=1)
+        out_mztab_prh.drop(assays_run, inplace=True, axis=1)
+
     logger.debug("Calculating protein coverage (bottleneck)...")
     # This is a bottleneck
     # reimplementation runs in 67s vs 137s (old) in my data
@@ -858,11 +874,12 @@ def mztab_prh(report, pg, index_ref, database, fasta_df, diann_version):
 
 
 def mztab_peh(
-    report: pd.DataFrame,
-    pr: pd.DataFrame,
-    precursor_list: List[str],
-    index_ref: pd.DataFrame,
-    database: os.PathLike,
+        report: pd.DataFrame,
+        pr: pd.DataFrame,
+        precursor_list: List[str],
+        index_ref: pd.DataFrame,
+        database: os.PathLike,
+        ms_run_index: Dict
 ) -> pd.DataFrame:
     """
     Construct PEH sub-table.
@@ -877,6 +894,8 @@ def mztab_peh(
     :type index_ref: pandas.core.frame.DataFrame
     :param database: Path to fasta file
     :type database: str
+    :param ms_run_index: Dict for MS run index
+    :type ms_run_index: dict
     :return: PEH sub-table
     :rtype: pandas.core.frame.DataFrame
     """
@@ -935,7 +954,6 @@ def mztab_peh(
     ]
     for i in null_col:
         out_mztab_peh.loc[:, i] = "null"
-    out_mztab_peh.loc[:, "opt_global_cv_MS:1002217_decoy_peptide"] = "0"
 
     logger.debug("Matching precursor IDs...")
     # Pre-calculating the indices and using a lookup table drops run time from
@@ -948,14 +966,18 @@ def mztab_peh(
     logger.debug("Getting scores per run")
     # This implementation is 422-700x faster than the apply-based one
     tmp = (
-        report.groupby(["precursor.Index", "ms_run"])
+        report.groupby(["precursor.Index", "Run"])
         .agg({"Q.Value": ["min"]})
         .reset_index()
-        .pivot(columns=["ms_run"], index="precursor.Index")
+        .pivot(columns=["Run"], index="precursor.Index")
         .reset_index()
     )
+
     tmp.columns = pd.Index(
-        ["::".join([str(s) for s in col]).strip() for col in tmp.columns.values]
+        ["::".join([str(s) if s in ["precursor.Index",
+                                    "Q.Value",
+                                    "min",
+                                    ""] else str(ms_run_index[s]) for s in col]).strip() for col in tmp.columns.values]
     )
     subname_mapper = {
         "precursor.Index::::": "precursor.Index",
@@ -963,7 +985,6 @@ def mztab_peh(
     }
     name_mapper = name_mapper_builder(subname_mapper)
     tmp.rename(columns=name_mapper, inplace=True)
-    print(tmp.columns)
     out_mztab_peh = out_mztab_peh.merge(
         tmp.rename(columns={"precursor.Index": "pr_id"}),
         on="pr_id",
@@ -1019,6 +1040,14 @@ def mztab_peh(
     out_mztab_peh.loc[:, "PEH"] = "PEP"
     out_mztab_peh.loc[:, "database"] = str(database)
     index = out_mztab_peh.loc[:, "PEH"]
+    # Merge Decoy, I think Decoy is unique for Precursor.
+    if "Decoy" in report.columns:
+        out_mztab_peh = pd.merge(out_mztab_peh, report[["Precursor.Id", "Decoy"]].drop_duplicates(),
+                                 on="Precursor.Id", how="inner")
+        out_mztab_peh.rename(columns={"Decoy": "opt_global_cv_MS:1002217_decoy_peptide"}, inplace=True)
+    else:
+        out_mztab_peh.loc[:, "opt_global_cv_MS:1002217_decoy_peptide"] = "0"
+
     out_mztab_peh.drop(["PEH", "Precursor.Id", "Genes", "pr_id"], axis=1, inplace=True)
     out_mztab_peh.insert(0, "PEH", index)
     out_mztab_peh.fillna("null", inplace=True)
@@ -1030,7 +1059,7 @@ def mztab_peh(
     return out_mztab_peh
 
 
-def mztab_psh(report, folder, database):
+def mztab_psh(report, folder, database, ms_run_index):
     """
     Construct PSH sub-table.
 
@@ -1042,6 +1071,8 @@ def mztab_psh(report, folder, database):
     :type folder: str
     :param database: Path to fasta file
     :type database: str
+    :param ms_run_index: Dict for MS run index
+    :type ms_run_index: dict
     :return: PSH sub-table
     :rtype: pandas.core.frame.DataFrame
     """
@@ -1069,7 +1100,6 @@ def __find_info(directory, n):
 
         file = __find_info(folder, n)
         target = pd.read_parquet(file)
-
         # Read original parquet columns from mzml_stats
         target = target[target[MS_LEVEL] == 2]
         target.reset_index(inplace=True, drop=True)
@@ -1086,8 +1116,8 @@ def __find_info(directory, n):
         # Standardize spectrum identifier format for bruker data
         if not isinstance(target.loc[0, "opt_global_spectrum_reference"], str):
             target.loc[:, "opt_global_spectrum_reference"] = "scan=" + target.loc[
-                :, "opt_global_spectrum_reference"
-            ].astype(str)
+                                                                       :, "opt_global_spectrum_reference"
+                                                                       ].astype(str)
 
         # TODO seconds returned from precursor.getRT()
         # RT column of DIA-NN 2.0.* is float32 type, but RT column of DIA-NN 1.8.* is float64 type
@@ -1170,13 +1200,13 @@ def __find_info(directory, n):
         "Global.Q.Value",
         "Global.Q.Value",
         "opt_global_spectrum_reference",
-        "ms_run",
+        "Run",
     ]
 
     if "Decoy" in out_mztab_psh.columns:
         out_mztab_psh = out_mztab_psh[
             psm_columns + ["Decoy"]
-        ]
+            ]
         out_mztab_psh.columns = [
             "sequence",
             "accession",
@@ -1242,14 +1272,14 @@ def __find_info(directory, n):
     )
 
     out_mztab_psh.loc[:, "spectra_ref"] = out_mztab_psh.apply(
-        lambda x: "ms_run[{}]:".format(x["ms_run"]) + x["opt_global_spectrum_reference"],
+        lambda x: "ms_run[{}]:".format(ms_run_index[x["ms_run"]]) + x["opt_global_spectrum_reference"],
         axis=1,
         result_type="expand",
     )
 
     out_mztab_psh.loc[:, "opt_global_cv_MS:1000889_peptidoform_sequence"] = out_mztab_psh.apply(
         lambda x: AASequence.fromString(
-            x["opt_global_cv_MS:1000889_peptidoform_sequence"]
+            x["opt_global_cv_MS:1000889_peptidoform_sequence"].replace("^", "")
         ).toString(),
         axis=1,
         result_type="expand",
@@ -1269,24 +1299,6 @@ def __find_info(directory, n):
     return out_mztab_psh
 
 
-def add_info(target, index_ref):
-    """
-    On the basis of f_table, two columns "ms_run" and "study_variable" are added for matching.
-
-    :param target: The value of "Run" column in f_table
-    :type target: str
-    :param index_ref: A dataframe on the basis of f_table
-    :type index_ref: pandas.core.frame.DataFrame
-    :return: A tuple contains ms_run and study_variable
-    :rtype: tuple
-    """
-    match = index_ref[index_ref["Run"] == target]
-    ms_run = match["ms_run"].values[0]
-    study_variable = match["study_variable"].values[0]
-
-    return ms_run, study_variable
-
-
 def classify_result_type(target):
     """Classify proteins
 
@@ -1300,62 +1312,6 @@ def classify_result_type(target):
     return "single_protein"
 
 
-def match_in_report(report, target, max_, flag, level):
-    """
-    This function is used to match the columns "ms_run" and "study_variable" from the report and
-    get the corresponding information for the mztab ms_run and study_values metadata values.
-
-    :param report: Dataframe for Dia-NN main report
-    :type report: pandas.core.frame.DataFrame
-    :param target: The value of "pr_id" column in out_mztab_PEH(level="peptide") or the "accession" column in out_mztab_PRH(level="protein")
-    :type target: str
-    :param max_: max_assay or max_study_variable
-    :type max_: int
-    :param flag: Match the "study_variable" column(flag=1) or the "ms_run" column(flag=0) in the filter result
-    :type flag: int
-    :param level: "pep" or "protein"
-    :type level: str
-    :return: A tuple contains multiple messages
-    :rtype: tuple
-    """  # noqa
-    if flag == 1 and level == "pep":
-        result = report[report["precursor.Index"] == target]
-        peh_params = []
-        for i in range(1, max_ + 1):
-            match = result[result["study_variable"] == i]
-            peh_params.extend(
-                [
-                    match["Precursor.Normalised"].mean(),
-                    "null",
-                    "null",
-                    "null",
-                    match["RT.Start"].mean(),
-                ]
-            )
-
-        return tuple(peh_params)
-
-    if flag == 0 and level == "pep":
-        result = report[report["precursor.Index"] == target]
-        q_value = []
-        for i in range(1, max_ + 1):
-            match = result[result["ms_run"] == i]
-            q_value.append(
-                match["Q.Value"].values[0] if match["Q.Value"].values.size > 0 else np.nan
-            )
-
-        return tuple(q_value)
-
-    if flag == 1 and level == "protein":
-        result = report[report["Protein.Group"] == target]
-        prh_params = []
-        for i in range(1, max_ + 1):
-            match = result[result["study_variable"] == i]
-            prh_params.extend([match["PG.MaxLFQ"].mean(), "null", "null"])
-
-        return tuple(prh_params)
-
-
 class ModScoreLooker:
     """
     Class used to cache the lookup table of accessions to best scores and their
@@ -1605,7 +1561,7 @@ def calculate_coverage(ref_sequence: str, sequences: Set[str]):
     for start, length in sorted(zip(starts, lengths)):
         if merged_starts and merged_starts[-1] + merged_lengths[-1] >= start:
             merged_lengths[-1] = (
-                max(merged_starts[-1] + merged_lengths[-1], start + length) - merged_starts[-1]
+                    max(merged_starts[-1] + merged_lengths[-1], start + length) - merged_starts[-1]
             )
         else:
             merged_starts.append(start)
@@ -1617,7 +1573,7 @@ def calculate_coverage(ref_sequence: str, sequences: Set[str]):
 
 
 def calculate_protein_coverages(
-    report: pd.DataFrame, out_mztab_prh: pd.DataFrame, fasta_df: pd.DataFrame
+        report: pd.DataFrame, out_mztab_prh: pd.DataFrame, fasta_df: pd.DataFrame
 ) -> List[str]:
     """Calculates protein coverages for the PRH table.
 

From d2a9783f8621a6dbf24413e9c84851e29a63c546 Mon Sep 17 00:00:00 2001
From: Yasset Perez-Riverol <ypriverol@gmail.com>
Date: Thu, 20 Feb 2025 14:53:58 +0000
Subject: [PATCH 5/9] REmove the feature commands -> quantms-rescoring

---
 quantmsutils/features/__init__.py     |   0
 quantmsutils/features/sage_feature.py |  38 ---------
 quantmsutils/features/snr.py          | 113 --------------------------
 3 files changed, 151 deletions(-)
 delete mode 100644 quantmsutils/features/__init__.py
 delete mode 100644 quantmsutils/features/sage_feature.py
 delete mode 100644 quantmsutils/features/snr.py

diff --git a/quantmsutils/features/__init__.py b/quantmsutils/features/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/quantmsutils/features/sage_feature.py b/quantmsutils/features/sage_feature.py
deleted file mode 100644
index 14c9227..0000000
--- a/quantmsutils/features/sage_feature.py
+++ /dev/null
@@ -1,38 +0,0 @@
-import click
-import pandas as pd
-import pyopenms as oms
-
-
-@click.command("sage2feature")
-@click.option("--idx_file", "-i", help="Input idXML file")
-@click.option("--output_file", "-o", help="Output idXML file")
-@click.option("--feat_file", "-f", help="Input feature table file")
-@click.pass_context
-def add_sage_feature(ctx, idx_file: str, output_file: str, feat_file: str):
-    """
-    Add extra features in features idXML. Adding extra feature in Sage isn't known input for PSMFeatureExtractor
-
-    :param ctx: click context
-    :param idx_file: Original idXML file
-    :param output_file: Outpuf file with the extra feature
-    :param feat_file: Feature file from Sage
-    :return: None
-    """
-    extra_feat = []
-    feat = pd.read_csv(feat_file, sep="\t")
-    for _, row in feat.iterrows():
-        if row["feature_generator"] == "psm_file":
-            continue
-        else:
-            extra_feat.append(row["feature_name"])
-    print("Adding extra feature: {}".format(extra_feat))
-    protein_ids = []
-    peptide_ids = []
-    oms.IdXMLFile().load(idx_file, protein_ids, peptide_ids)
-    search_parameters = protein_ids[0].getSearchParameters()
-    features = search_parameters.getMetaValue("extra_features")
-    extra_features = features + "," + ",".join(extra_feat)
-    search_parameters.setMetaValue("extra_features", extra_features)
-    protein_ids[0].setSearchParameters(search_parameters)
-    oms.IdXMLFile().store(output_file, protein_ids, peptide_ids)
-    print("Done")
diff --git a/quantmsutils/features/snr.py b/quantmsutils/features/snr.py
deleted file mode 100644
index c0787b1..0000000
--- a/quantmsutils/features/snr.py
+++ /dev/null
@@ -1,113 +0,0 @@
-import re
-
-import click
-import numpy as np
-from pyopenms import MzMLFile, MSExperiment, SpectrumLookup
-import pyopenms as oms
-from scipy.stats import entropy
-
-
-def compute_signal_to_noise(intensities):
-    """
-    Compute the signal-to-noise ratio for a given spectrum
-    :param intensities: intensity values
-    :return:
-    """
-    rmsd = np.sqrt(np.mean(np.square(intensities)))
-
-    # Calculate SNR
-    snr = np.max(intensities) / rmsd
-
-    return snr
-
-
-@click.command("spectrum2feature")
-@click.option("--ms_path", type=click.Path(exists=True), required=True)
-@click.option(
-    "--idxml",
-    help="The idxml file with the PSMs corresponding to the mzML file",
-    required=True,
-)
-@click.option(
-    "--output",
-    help="The output idXML file with the signal-to-noise ratio",
-    required=True,
-)
-@click.pass_context
-def spectrum2feature(ctx, ms_path: str, idxml: str, output: str) -> None:
-    """
-    The snr function computes the signal-to-noise ratio for a given mass spectrometry file and its corresponding
-    identification file.
-
-    :param ctx: Click context
-    :param ms_path: A string specifying the path to the mass spectrometry file.
-    :param idxml: A string specifying the path to the identification file.
-    :param output: A string specifying the path to the output file.
-    """
-
-    mzml_file = MSExperiment()
-    MzMLFile().load(ms_path, mzml_file)
-
-    lookup = SpectrumLookup()
-    lookup.readSpectra(mzml_file, "scan=(?<SCAN>\\d+)")
-
-    protein_ids = []
-    peptide_ids = []
-    oms.IdXMLFile().load(idxml, protein_ids, peptide_ids)
-
-    result_peptides = []
-    for peptide in peptide_ids:
-        spectrum_reference = peptide.getMetaValue("spectrum_reference")
-        scan_number = int(re.findall(r"(spectrum|scan)=(\d+)", spectrum_reference)[0][1])
-
-        try:
-            index = lookup.findByScanNumber(scan_number)
-            spectrum = mzml_file.getSpectrum(index)
-            intensity_array = spectrum.get_peaks()[1]
-            mz_array = spectrum.get_peaks()[0]
-
-            ## Compute signal to noise ratio
-            snr = compute_signal_to_noise(intensity_array)
-
-            # Spectral Entropy
-            tic = np.sum(intensity_array)
-            normalized_intensities = intensity_array / tic
-            spectral_entropy = entropy(normalized_intensities)
-
-            # Fraction of TIC in Top 10 Peaks
-            top_n_peaks = np.sort(intensity_array)[-10:]
-            fraction_tic_top_10 = np.sum(top_n_peaks) / tic
-
-            # Intensity Weighted m/z Standard Deviation
-            weighted_mean_mz = np.sum(np.array(mz_array) * normalized_intensities)
-            weighted_std_mz = np.sqrt(
-                np.sum(normalized_intensities * (np.array(mz_array) - weighted_mean_mz) ** 2)
-            )
-
-            for hit in peptide.getHits():
-                hit.setMetaValue("quantms:SNR", str(snr))
-                hit.setMetaValue("quantms:SpectralEntropy", str(spectral_entropy))
-                hit.setMetaValue("quantms:FracTICinTop10Peaks", str(fraction_tic_top_10))
-                hit.setMetaValue("quantms:WeightedStdMz", str(weighted_std_mz))
-
-                # update hit in peptidehits list
-                peptide.setHits([hit])
-            ## update peptide in peptide_ids list
-            result_peptides.append(peptide)
-
-        except IndexError:
-            message = "scan_number" + str(scan_number) + "not found in file: " + ms_path
-            print(message)
-
-    # Add quantms:SNR as a feature
-    search_parameters = protein_ids[0].getSearchParameters()
-    features = search_parameters.getMetaValue("extra_features")
-    extra_features = (
-        features
-        + ",quantms:SNR,quantms:SpectralEntropy,quantms:FracTICinTop10Peaks,quantms:WeightedStdMz"
-    )
-    search_parameters.setMetaValue("extra_features", extra_features)
-    protein_ids[0].setSearchParameters(search_parameters)
-
-    oms.IdXMLFile().store(output, protein_ids, peptide_ids)
-    print("The output file with the signal-to-noise ratio has been saved at: ", output)

From e85331c8f6f781b9399b2df60ba45a69c89566a1 Mon Sep 17 00:00:00 2001
From: Yasset Perez-Riverol <ypriverol@gmail.com>
Date: Thu, 20 Feb 2025 14:56:38 +0000
Subject: [PATCH 6/9] REmove the feature commands -> quantms-rescoring

---
 README.md                     |  5 ----
 quantmsutils/quantmsutilsc.py |  4 ----
 tests/test_commands.py        | 44 -----------------------------------
 3 files changed, 53 deletions(-)

diff --git a/README.md b/README.md
index d9faa94..4f848e9 100644
--- a/README.md
+++ b/README.md
@@ -27,11 +27,6 @@ The following functionalities are available in the package:
 - `openms2sample` - Extra sample information from OpenMS experimental design file. An example of OpenMS experimental design file is available [here](https://github.com/bigbio/quantms-utils/blob/dev/tests/test_data/BSA_design_urls.tsv).
 - `checksamplesheet` - Check the sample sheet for errors and inconsistencies. The experimental design coult be an OpenMS experimental design file or and SDRF file. 
 
-### Features to percolator scripts
-
-- `sage2feature` - The add_sage_feature function enhances an idXML file by appending additional features from a Sage feature table, excluding those generated by 'psm_file'.
-- `spectrum2feature`  - Add the signal-to-noise ratio (SNR) to the feature table for percolator.
-
 ### Other scripts
 
 - `psmconvert` - The convert_psm function converts peptide spectrum matches (PSMs) from an idXML file to a CSV file, optionally filtering out decoy matches. It extracts and processes data from both the idXML and an associated spectra file, handling multiple search engines and scoring systems.
diff --git a/quantmsutils/quantmsutilsc.py b/quantmsutils/quantmsutilsc.py
index d258a2e..815180f 100644
--- a/quantmsutils/quantmsutilsc.py
+++ b/quantmsutils/quantmsutilsc.py
@@ -2,12 +2,10 @@
 
 from quantmsutils.diann.diann2mztab import diann2mztab
 from quantmsutils.diann.dianncfg import dianncfg
-from quantmsutils.features.sage_feature import add_sage_feature
 from quantmsutils.mzml.mzml_statistics import mzml_statistics
 from quantmsutils.psm.psm_conversion import convert_psm
 from quantmsutils.sdrf.check_samplesheet import checksamplesheet
 from quantmsutils.sdrf.extract_sample import extract_sample_from_expdesign
-from quantmsutils.features.snr import spectrum2feature
 from quantmsutils import __version__
 
 CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])
@@ -23,12 +21,10 @@ def cli():
 
 cli.add_command(dianncfg)
 cli.add_command(diann2mztab)
-cli.add_command(add_sage_feature)
 cli.add_command(mzml_statistics)
 cli.add_command(extract_sample_from_expdesign)
 cli.add_command(checksamplesheet)
 cli.add_command(convert_psm)
-cli.add_command(spectrum2feature)
 
 
 def main():
diff --git a/tests/test_commands.py b/tests/test_commands.py
index d49032f..b648fc3 100644
--- a/tests/test_commands.py
+++ b/tests/test_commands.py
@@ -11,14 +11,6 @@ def test_create_diann_cfg_help():
     assert result.exit_code == 0
 
 
-# test for the add_sage_feature command in cli
-def test_add_sage_feature_help():
-    runner = CliRunner()
-    result = runner.invoke(cli, ["sage2feature", "--help"])
-
-    assert result.exit_code == 0
-
-
 # test for the mzml_statistics command in cli
 def test_mzml_statistics_help():
     runner = CliRunner()
@@ -43,42 +35,6 @@ def test_extract_sample_from_expdesign_help():
     assert result.exit_code == 0
 
 
-def test_sage_feature_file():
-    runner = CliRunner()
-    result = runner.invoke(
-        cli,
-        [
-            "sage2feature",
-            "--idx_file",
-            "tests/test_data/TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01_sage_ms2rescore.idXML",
-            "--output_file",
-            "tests/test_data/TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01_sage_ms2rescore_feat_gen.idXML",
-            "--feat_file",
-            "tests/test_data/tmt_erwinia_1ulsike_top10hcd_isol2_45stepped_60min_01_sage_ms2rescore.idxml.feature_names.tsv",
-        ],
-    )
-
-    assert result.exit_code == 0
-
-
-def test_spectrum2fature_file():
-    runner = CliRunner()
-    result = runner.invoke(
-        cli,
-        [
-            "spectrum2feature",
-            "--ms_path",
-            "tests/test_data/TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01.mzML",
-            "--idxml",
-            "tests/test_data/TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01_sage_ms2rescore.idXML",
-            "--output",
-            "tests/test_data/TMT_Erwinia_1uLSike_Top10HCD_isol2_45stepped_60min_01_sage_ms2rescore_snr.idXML",
-        ],
-    )
-
-    assert result.exit_code == 0
-
-
 def test_diann2mztab_example():
     runner = CliRunner()
     result = runner.invoke(

From 04d2af6e752c831138e950e88b2f70f087a19de7 Mon Sep 17 00:00:00 2001
From: Yasset Perez-Riverol <ypriverol@gmail.com>
Date: Thu, 20 Feb 2025 14:57:48 +0000
Subject: [PATCH 7/9] REmove the feature commands -> quantms-rescoring

---
 recipe/meta.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/recipe/meta.yaml b/recipe/meta.yaml
index f6c4210..bc30f7b 100644
--- a/recipe/meta.yaml
+++ b/recipe/meta.yaml
@@ -1,7 +1,7 @@
 # recipe/meta.yaml
 package:
   name: quantms-utils
-  version: "0.0.18"
+  version: "0.0.19"
 
 source:
   path: ../

From 09273b91bfef19f706a1e1fba0e74b032c7cf94e Mon Sep 17 00:00:00 2001
From: Yasset Perez-Riverol <ypriverol@gmail.com>
Date: Thu, 20 Feb 2025 16:12:41 +0000
Subject: [PATCH 8/9] REmove the feature commands -> quantms-rescoring

---
 quantmsutils/diann/diann2mztab.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/quantmsutils/diann/diann2mztab.py b/quantmsutils/diann/diann2mztab.py
index a130eb2..534a993 100644
--- a/quantmsutils/diann/diann2mztab.py
+++ b/quantmsutils/diann/diann2mztab.py
@@ -77,6 +77,7 @@ def diann2mztab(
     :type missed_cleavages: int
     :param qvalue_threshold: Threshold for filtering q value
     :type qvalue_threshold: float
+    :param enable_diann2mztab: Enable conversion to mzTab
     """
     logger.debug(f"Revision {REVISION}")
     logger.debug("Reading input files...")
@@ -714,11 +715,6 @@ def mztab_prh(report, pg, index_ref, database, fasta_df, diann_version):
         f" input index_ref shape: {index_ref.shape},"
         f" input fasta_df shape: {fasta_df.shape}"
     )
-    #  DIA-NN 2.0 PG file doesn't have Protein IDs column
-    if "2.0" not in diann_version:
-        file = list(pg.columns[5:])
-    else:
-        file = list(pg.columns[4:])
 
     logger.debug("Classifying results type ...")
     pg["opt_global_result_type"] = "single_protein"

From f72c96f61583b39364a95c70a1c4a1bf5fb04096 Mon Sep 17 00:00:00 2001
From: Yasset Perez-Riverol <ypriverol@gmail.com>
Date: Thu, 20 Feb 2025 16:21:24 +0000
Subject: [PATCH 9/9] commit unnused variable

---
 quantmsutils/diann/diann2mztab.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/quantmsutils/diann/diann2mztab.py b/quantmsutils/diann/diann2mztab.py
index 534a993..290f254 100644
--- a/quantmsutils/diann/diann2mztab.py
+++ b/quantmsutils/diann/diann2mztab.py
@@ -902,7 +902,6 @@ def mztab_peh(
         f" len(precursor_list): {len(precursor_list)},"
         f" index_ref.shape: {index_ref.shape}"
     )
-    out_mztab_peh = pd.DataFrame()
     out_mztab_peh = pr.iloc[:, 0:10]
     out_mztab_peh.drop(
         ["Protein.Ids", "Protein.Names", "First.Protein.Description", "Proteotypic"],