mumble related changes

compomics · Jan 17, 2025 · ca9da7d · ca9da7d
1 parent 21cafc7
commit ca9da7d
Show file tree

Hide file tree

Showing 4 changed files with 97 additions and 69 deletions.
diff --git a/ms2rescore/core.py b/ms2rescore/core.py
@@ -111,8 +111,10 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
     psm_list = psm_list[psms_with_features]
 
     if "mumble" in config["psm_generator"]:
-        # Remove PSMS that have a less matched ions than the original hit
-        psm_list = filter_mumble_psms(psm_list)
+        # Remove PSMs where matched_ions_pct drops 25% below the original hit
+        psm_list = filter_mumble_psms(psm_list, threshold=0.75)
+        # Currently replace the score with the hyperscore for Mumble
+        # psm_list["score"] = [ft["hyperscore"] for ft in psm_list["rescoring_features"]] # TODO: This is a temporary fix
 
     # Write feature names to file
     _write_feature_names(feature_names, output_file_root)
@@ -288,7 +290,9 @@ def _calculate_confidence(psm_list: PSMList) -> PSMList:
     )
 
     # Recalculate confidence
-    new_confidence = lin_psm_data.assign_confidence(scores=psm_list["score"])
+    new_confidence = lin_psm_data.assign_confidence(
+        scores=list(psm_list["score"])
+    )  # explicity make it a list to avoid TypingError: Failed in nopython mode pipeline (step: nopython frontend) in mokapot
 
     # Add new confidence estimations to PSMList
     add_psm_confidence(psm_list, new_confidence)

diff --git a/ms2rescore/feature_generators/ms2.py b/ms2rescore/feature_generators/ms2.py
@@ -7,13 +7,12 @@
 import re
 from collections import defaultdict
 from itertools import chain
-from typing import List, Optional, Union
+from typing import List, Optional
 
 import numpy as np
 from psm_utils import PSMList
-from pyteomics import mass, mgf, mzml
 from rustyms import FragmentationModel, LinearPeptide, MassMode, RawSpectrum
-from pathlib import Path
+from ms2rescore_rs import get_ms2_spectra
 
 from ms2rescore.exceptions import ParseSpectrumError
 from ms2rescore.feature_generators.base import FeatureGeneratorBase
@@ -68,7 +67,6 @@ def __init__(
         self.processes = processes
         self.calculate_hyperscore = calculate_hyperscore
 
-
     @property
     def feature_names(self) -> List[str]:
         return [
@@ -109,26 +107,22 @@ def add_features(self, psm_list: PSMList) -> None:
     def _calculate_features(self, psm_list: PSMList, spectrum_file: str) -> List:
         """Retrieve annotated spectra for all psms."""
 
-        spectrum_reader = self._read_spectrum_file(spectrum_file)
-
+        spectra = get_ms2_spectra(str(spectrum_file))
         spectrum_id_pattern = re.compile(
             self.spectrum_id_pattern if self.spectrum_id_pattern else r"(.*)"
         )
-
         try:
-            mapper = {
-                spectrum_id_pattern.search(spectrum_id).group(1): spectrum_id
-                for spectrum_id in spectrum_reader._offset_index.mapping["spectrum"].keys()
+            spectra_dict = {
+                spectrum_id_pattern.search(spectrum.identifier).group(1): spectrum
+                for spectrum in spectra
             }
         except AttributeError:
             raise ParseSpectrumError(
                 "Could not parse spectrum IDs using ´spectrum_id_pattern´. Please make sure that there is a capturing in the pattern."
             )
-        annotated_spectra = [
-            self._annotate_spectrum(psm, spectrum_reader.get_by_id(mapper[psm.spectrum_id]))
-            for psm in psm_list
-        ]
-        for psm, annotated_spectrum in zip(psm_list, annotated_spectra):
+
+        for psm in psm_list:
+            annotated_spectrum = self._annotate_spectrum(psm, spectra_dict[psm.spectrum_id])
             psm.rescoring_features.update(
                 self._calculate_spectrum_features(psm, annotated_spectrum)
             )
@@ -137,20 +131,9 @@ def _calculate_features(self, psm_list: PSMList, spectrum_file: str) -> List:
                 # Filters out peaks which are unnannotated (can be specified, but keep at b-y ions of any charge ?)
                 b, y = get_by_fragments(annotated_spectrum)
                 hs = calculate_hyperscore(
-                    n_y=len(y),
-                    n_b=len(b),
-                    y_ion_intensities=y,
-                    b_ion_intensities=b
+                    n_y=len(y), n_b=len(b), y_ion_intensities=y, b_ion_intensities=b
                 )
-                psm.rescoring_features.update({'hyperscore': hs})
-
-    @staticmethod
-    def _read_spectrum_file(spectrum_filepath: Path) -> Union[mzml.PreIndexedMzML, mgf.IndexedMGF]:
-
-        if spectrum_filepath.suffix.lower() == ".mzml":
-            return mzml.PreIndexedMzML(str(spectrum_filepath))
-        elif spectrum_filepath.suffix.lower() == ".mgf":
-            return mgf.IndexedMGF(str(spectrum_filepath))
+                psm.rescoring_features.update({"hyperscore": hs})
 
     @staticmethod
     def _longest_ion_sequence(lst):
@@ -219,16 +202,16 @@ def _calculate_spectrum_features(self, psm, annotated_spectrum):
             / (len(b_ions_matched) + len(y_ions_matched)),
         }
 
-    def _annotate_spectrum(self, psm, pyteomics_spectrum):
+    def _annotate_spectrum(self, psm, spectrum):
 
         spectrum = RawSpectrum(
             title=psm.spectrum_id,
             num_scans=1,
             rt=psm.retention_time,
             precursor_charge=psm.get_precursor_charge(),
-            precursor_mass=mass.calculate_mass(psm.peptidoform.composition),
-            mz_array=pyteomics_spectrum["m/z array"],
-            intensity_array=pyteomics_spectrum["intensity array"],
+            precursor_mass=spectrum.precursor.mz,
+            mz_array=spectrum.mz,
+            intensity_array=spectrum.intensity,
         )
         try:
             linear_peptide = LinearPeptide(psm.peptidoform.proforma.split("/")[0])
@@ -242,7 +225,7 @@ def _annotate_spectrum(self, psm, pyteomics_spectrum):
 
         return annotated_spectrum.spectrum
 
- 
+
 def factorial(n):
     """
     Compute factorial of n using a loop.
@@ -255,7 +238,8 @@ def factorial(n):
     for i in range(1, n + 1):
         result *= i
     return result
-
+
+
 def calculate_hyperscore(n_y, n_b, y_ion_intensities, b_ion_intensities):
     """
     Calculate the hyperscore for a peptide-spectrum match.
@@ -270,24 +254,25 @@ def calculate_hyperscore(n_y, n_b, y_ion_intensities, b_ion_intensities):
     # Calculate the product of y-ion and b-ion intensities
     product_y = np.sum(y_ion_intensities) if y_ion_intensities else 1
     product_b = np.sum(b_ion_intensities) if b_ion_intensities else 1
- 
+
     # Calculate factorial using custom function
     factorial_y = factorial(n_y)
     factorial_b = factorial(n_b)
- 
+
     # Compute hyperscore
-    hyperscore = np.log(factorial_y * factorial_b * (product_y+product_b))
+    hyperscore = np.log(factorial_y * factorial_b * (product_y + product_b))
     return hyperscore
 
-def infer_fragment_identity(frag, allow_ion_types=['b', 'y']):
+
+def infer_fragment_identity(frag, allow_ion_types=["b", "y"]):
     ion = frag.ion
 
     is_allowed = False
     for allowed_ion_type in allow_ion_types:
         if allowed_ion_type in ion:
-            is_allowed=True
+            is_allowed = True
             break
-    
+
     if not is_allowed:
         return False
     # Radicals
@@ -297,20 +282,21 @@ def infer_fragment_identity(frag, allow_ion_types=['b', 'y']):
         return False
     if frag.charge > 2:
         return False
-    
+
     return ion[0]
 
+
 def get_by_fragments(annotated_spectrum):
     b_intensities = []
     y_intensities = []
     for peak in annotated_spectrum:
-        
+
         for fragment in peak.annotation:
 
             ion_type = infer_fragment_identity(fragment)
-            
-            if ion_type == 'b':
+
+            if ion_type == "b":
                 b_intensities.append(peak.intensity)
-            if ion_type == 'y':
+            if ion_type == "y":
                 y_intensities.append(peak.intensity)
     return b_intensities, y_intensities
diff --git a/ms2rescore/parse_psms.py b/ms2rescore/parse_psms.py
@@ -94,6 +94,19 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList:
     # Addition of Modifications for mass shifts in the PSMs with Mumble
     if "mumble" in config["psm_generator"]:
         logger.debug("Applying modifications for mass shifts using Mumble...")
+
+        # set inlcude original psm to True and include decoy psm to true
+        if (
+            "include_original_psm" not in config["psm_generator"]["mumble"]
+            or not config["psm_generator"]["mumble"]
+        ):
+            config["psm_generator"]["mumble"]["include_original_psm"] = True
+        if (
+            "include_decoy_psm" not in config["psm_generator"]["mumble"]
+            or not config["psm_generator"]["mumble"]
+        ):
+            config["psm_generator"]["mumble"]["include_decoy_psm"] = True
+
         mumble_config = config["psm_generator"]["mumble"]
 
         # Check if psm_list[0].rescoring_features is empty or not

diff --git a/ms2rescore/utils.py b/ms2rescore/utils.py
@@ -95,30 +95,55 @@ def _is_minitdf(spectrum_file: str) -> bool:
     return len(files) >= 2
 
 
-def filter_mumble_psms(psm_list: PSMList) -> PSMList:
+def filter_mumble_psms(psm_list: PSMList, threshold=1) -> PSMList:
     """
-    Filter out PSMs with mumble in the peptide sequence.
+    Filter out mumble PSMs with `matched_ions_pct` lower than the original hit.
+
+    Parameters
+    ----------
+    psm_list : PSMList
+        List of PSMs to filter
+    threshold : float, optional
+        Threshold to lower the maximum matched_ions_pct of the original hit
     """
-    keep = [None] * len(psm_list)
-    original_hit = np.array([metadata.get("original_hit") for metadata in psm_list["metadata"]])
+    # Extract relevant fields from the PSM list
+    original_hit = np.array([metadata.get("original_psm") for metadata in psm_list["metadata"]])
     spectrum_indices = np.array([psm.spectrum_id for psm in psm_list])
     runs = np.array([psm.run for psm in psm_list])
-    if "matched_ions_pct" in psm_list[0].rescoring_features:
-        matched_ions = [psm.rescoring_features["matched_ions_pct"] for psm in psm_list]
-    else:
+
+    # Check if matched_ions_pct exists
+    if "matched_ions_pct" not in psm_list[0].rescoring_features:
         return psm_list
-    for i, psm in enumerate(psm_list):
-        if isinstance(keep[i], bool):
-            continue
-        elif original_hit[i]:
-            keep[i] = True
-        else:
-            original_matched_ions_pct = np.logical_and.reduce(
-                [original_hit, spectrum_indices == psm.spectrum_id, runs == psm.run]
-            )
-            if original_matched_ions_pct > matched_ions[i]:
-                keep[i] = False
-            else:
-                keep[i] = True
-    logger.debug(f"Filtered out {len(psm_list) - sum(keep)} mumble PSMs.")
+
+    matched_ions = np.array([psm.rescoring_features["matched_ions_pct"] for psm in psm_list])
+
+    # Create unique keys for each (run, spectrum_id)
+    unique_keys = np.core.defchararray.add(runs.astype(str), spectrum_indices.astype(str))
+    unique_keys_indices, inverse_indices = np.unique(unique_keys, return_inverse=True)
+
+    # Initialize an array to store the `matched_ions_pct` of original hits per group
+    original_matched_ions_pct = np.full(
+        len(unique_keys_indices), -np.inf
+    )  # Default to -inf for groups without original hits
+
+    # Assign the `matched_ions_pct` of original hits to their groups
+    np.maximum.at(
+        original_matched_ions_pct, inverse_indices[original_hit], matched_ions[original_hit]
+    )
+
+    # lower the maximum with the threshold
+    original_matched_ions_pct = original_matched_ions_pct * threshold
+
+    # Broadcast the original `matched_ions_pct` back to all PSMs in each group
+    original_matched_ions_for_all = original_matched_ions_pct[inverse_indices]
+
+    # Determine the filtering condition
+    keep = np.logical_or(
+        original_hit,  # Always keep original hits
+        matched_ions
+        >= original_matched_ions_for_all,  # Keep hits with `matched_ions_pct` >= the original
+    )
+
+    # Filter PSMs
+    logger.debug(f"Filtered out {len(psm_list) - np.sum(keep)} mumble PSMs.")
     return psm_list[keep]