Skip to content

Commit

Permalink
mumble related changes
Browse files Browse the repository at this point in the history
  • Loading branch information
ArthurDeclercq committed Jan 17, 2025
1 parent 21cafc7 commit ca9da7d
Show file tree
Hide file tree
Showing 4 changed files with 97 additions and 69 deletions.
10 changes: 7 additions & 3 deletions ms2rescore/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,8 +111,10 @@ def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
psm_list = psm_list[psms_with_features]

if "mumble" in config["psm_generator"]:
# Remove PSMS that have a less matched ions than the original hit
psm_list = filter_mumble_psms(psm_list)
# Remove PSMs where matched_ions_pct drops 25% below the original hit
psm_list = filter_mumble_psms(psm_list, threshold=0.75)
# Currently replace the score with the hyperscore for Mumble
# psm_list["score"] = [ft["hyperscore"] for ft in psm_list["rescoring_features"]] # TODO: This is a temporary fix

# Write feature names to file
_write_feature_names(feature_names, output_file_root)
Expand Down Expand Up @@ -288,7 +290,9 @@ def _calculate_confidence(psm_list: PSMList) -> PSMList:
)

# Recalculate confidence
new_confidence = lin_psm_data.assign_confidence(scores=psm_list["score"])
new_confidence = lin_psm_data.assign_confidence(
scores=list(psm_list["score"])
) # explicity make it a list to avoid TypingError: Failed in nopython mode pipeline (step: nopython frontend) in mokapot

# Add new confidence estimations to PSMList
add_psm_confidence(psm_list, new_confidence)
Expand Down
76 changes: 31 additions & 45 deletions ms2rescore/feature_generators/ms2.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,12 @@
import re
from collections import defaultdict
from itertools import chain
from typing import List, Optional, Union
from typing import List, Optional

import numpy as np
from psm_utils import PSMList
from pyteomics import mass, mgf, mzml
from rustyms import FragmentationModel, LinearPeptide, MassMode, RawSpectrum
from pathlib import Path
from ms2rescore_rs import get_ms2_spectra

from ms2rescore.exceptions import ParseSpectrumError
from ms2rescore.feature_generators.base import FeatureGeneratorBase
Expand Down Expand Up @@ -68,7 +67,6 @@ def __init__(
self.processes = processes
self.calculate_hyperscore = calculate_hyperscore


@property
def feature_names(self) -> List[str]:
return [
Expand Down Expand Up @@ -109,26 +107,22 @@ def add_features(self, psm_list: PSMList) -> None:
def _calculate_features(self, psm_list: PSMList, spectrum_file: str) -> List:
"""Retrieve annotated spectra for all psms."""

spectrum_reader = self._read_spectrum_file(spectrum_file)

spectra = get_ms2_spectra(str(spectrum_file))
spectrum_id_pattern = re.compile(
self.spectrum_id_pattern if self.spectrum_id_pattern else r"(.*)"
)

try:
mapper = {
spectrum_id_pattern.search(spectrum_id).group(1): spectrum_id
for spectrum_id in spectrum_reader._offset_index.mapping["spectrum"].keys()
spectra_dict = {
spectrum_id_pattern.search(spectrum.identifier).group(1): spectrum
for spectrum in spectra
}
except AttributeError:
raise ParseSpectrumError(
"Could not parse spectrum IDs using ´spectrum_id_pattern´. Please make sure that there is a capturing in the pattern."
)
annotated_spectra = [
self._annotate_spectrum(psm, spectrum_reader.get_by_id(mapper[psm.spectrum_id]))
for psm in psm_list
]
for psm, annotated_spectrum in zip(psm_list, annotated_spectra):

for psm in psm_list:
annotated_spectrum = self._annotate_spectrum(psm, spectra_dict[psm.spectrum_id])
psm.rescoring_features.update(
self._calculate_spectrum_features(psm, annotated_spectrum)
)
Expand All @@ -137,20 +131,9 @@ def _calculate_features(self, psm_list: PSMList, spectrum_file: str) -> List:
# Filters out peaks which are unnannotated (can be specified, but keep at b-y ions of any charge ?)
b, y = get_by_fragments(annotated_spectrum)
hs = calculate_hyperscore(
n_y=len(y),
n_b=len(b),
y_ion_intensities=y,
b_ion_intensities=b
n_y=len(y), n_b=len(b), y_ion_intensities=y, b_ion_intensities=b
)
psm.rescoring_features.update({'hyperscore': hs})

@staticmethod
def _read_spectrum_file(spectrum_filepath: Path) -> Union[mzml.PreIndexedMzML, mgf.IndexedMGF]:

if spectrum_filepath.suffix.lower() == ".mzml":
return mzml.PreIndexedMzML(str(spectrum_filepath))
elif spectrum_filepath.suffix.lower() == ".mgf":
return mgf.IndexedMGF(str(spectrum_filepath))
psm.rescoring_features.update({"hyperscore": hs})

@staticmethod
def _longest_ion_sequence(lst):
Expand Down Expand Up @@ -219,16 +202,16 @@ def _calculate_spectrum_features(self, psm, annotated_spectrum):
/ (len(b_ions_matched) + len(y_ions_matched)),
}

def _annotate_spectrum(self, psm, pyteomics_spectrum):
def _annotate_spectrum(self, psm, spectrum):

spectrum = RawSpectrum(
title=psm.spectrum_id,
num_scans=1,
rt=psm.retention_time,
precursor_charge=psm.get_precursor_charge(),
precursor_mass=mass.calculate_mass(psm.peptidoform.composition),
mz_array=pyteomics_spectrum["m/z array"],
intensity_array=pyteomics_spectrum["intensity array"],
precursor_mass=spectrum.precursor.mz,
mz_array=spectrum.mz,
intensity_array=spectrum.intensity,
)
try:
linear_peptide = LinearPeptide(psm.peptidoform.proforma.split("/")[0])
Expand All @@ -242,7 +225,7 @@ def _annotate_spectrum(self, psm, pyteomics_spectrum):

return annotated_spectrum.spectrum


def factorial(n):
"""
Compute factorial of n using a loop.
Expand All @@ -255,7 +238,8 @@ def factorial(n):
for i in range(1, n + 1):
result *= i
return result



def calculate_hyperscore(n_y, n_b, y_ion_intensities, b_ion_intensities):
"""
Calculate the hyperscore for a peptide-spectrum match.
Expand All @@ -270,24 +254,25 @@ def calculate_hyperscore(n_y, n_b, y_ion_intensities, b_ion_intensities):
# Calculate the product of y-ion and b-ion intensities
product_y = np.sum(y_ion_intensities) if y_ion_intensities else 1
product_b = np.sum(b_ion_intensities) if b_ion_intensities else 1

# Calculate factorial using custom function
factorial_y = factorial(n_y)
factorial_b = factorial(n_b)

# Compute hyperscore
hyperscore = np.log(factorial_y * factorial_b * (product_y+product_b))
hyperscore = np.log(factorial_y * factorial_b * (product_y + product_b))
return hyperscore

def infer_fragment_identity(frag, allow_ion_types=['b', 'y']):

def infer_fragment_identity(frag, allow_ion_types=["b", "y"]):
ion = frag.ion

is_allowed = False
for allowed_ion_type in allow_ion_types:
if allowed_ion_type in ion:
is_allowed=True
is_allowed = True
break

if not is_allowed:
return False
# Radicals
Expand All @@ -297,20 +282,21 @@ def infer_fragment_identity(frag, allow_ion_types=['b', 'y']):
return False
if frag.charge > 2:
return False

return ion[0]


def get_by_fragments(annotated_spectrum):
b_intensities = []
y_intensities = []
for peak in annotated_spectrum:

for fragment in peak.annotation:

ion_type = infer_fragment_identity(fragment)
if ion_type == 'b':

if ion_type == "b":
b_intensities.append(peak.intensity)
if ion_type == 'y':
if ion_type == "y":
y_intensities.append(peak.intensity)
return b_intensities, y_intensities
13 changes: 13 additions & 0 deletions ms2rescore/parse_psms.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,19 @@ def parse_psms(config: Dict, psm_list: Union[PSMList, None]) -> PSMList:
# Addition of Modifications for mass shifts in the PSMs with Mumble
if "mumble" in config["psm_generator"]:
logger.debug("Applying modifications for mass shifts using Mumble...")

# set inlcude original psm to True and include decoy psm to true
if (
"include_original_psm" not in config["psm_generator"]["mumble"]
or not config["psm_generator"]["mumble"]
):
config["psm_generator"]["mumble"]["include_original_psm"] = True
if (
"include_decoy_psm" not in config["psm_generator"]["mumble"]
or not config["psm_generator"]["mumble"]
):
config["psm_generator"]["mumble"]["include_decoy_psm"] = True

mumble_config = config["psm_generator"]["mumble"]

# Check if psm_list[0].rescoring_features is empty or not
Expand Down
67 changes: 46 additions & 21 deletions ms2rescore/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,30 +95,55 @@ def _is_minitdf(spectrum_file: str) -> bool:
return len(files) >= 2


def filter_mumble_psms(psm_list: PSMList) -> PSMList:
def filter_mumble_psms(psm_list: PSMList, threshold=1) -> PSMList:
"""
Filter out PSMs with mumble in the peptide sequence.
Filter out mumble PSMs with `matched_ions_pct` lower than the original hit.
Parameters
----------
psm_list : PSMList
List of PSMs to filter
threshold : float, optional
Threshold to lower the maximum matched_ions_pct of the original hit
"""
keep = [None] * len(psm_list)
original_hit = np.array([metadata.get("original_hit") for metadata in psm_list["metadata"]])
# Extract relevant fields from the PSM list
original_hit = np.array([metadata.get("original_psm") for metadata in psm_list["metadata"]])
spectrum_indices = np.array([psm.spectrum_id for psm in psm_list])
runs = np.array([psm.run for psm in psm_list])
if "matched_ions_pct" in psm_list[0].rescoring_features:
matched_ions = [psm.rescoring_features["matched_ions_pct"] for psm in psm_list]
else:

# Check if matched_ions_pct exists
if "matched_ions_pct" not in psm_list[0].rescoring_features:
return psm_list
for i, psm in enumerate(psm_list):
if isinstance(keep[i], bool):
continue
elif original_hit[i]:
keep[i] = True
else:
original_matched_ions_pct = np.logical_and.reduce(
[original_hit, spectrum_indices == psm.spectrum_id, runs == psm.run]
)
if original_matched_ions_pct > matched_ions[i]:
keep[i] = False
else:
keep[i] = True
logger.debug(f"Filtered out {len(psm_list) - sum(keep)} mumble PSMs.")

matched_ions = np.array([psm.rescoring_features["matched_ions_pct"] for psm in psm_list])

# Create unique keys for each (run, spectrum_id)
unique_keys = np.core.defchararray.add(runs.astype(str), spectrum_indices.astype(str))
unique_keys_indices, inverse_indices = np.unique(unique_keys, return_inverse=True)

# Initialize an array to store the `matched_ions_pct` of original hits per group
original_matched_ions_pct = np.full(
len(unique_keys_indices), -np.inf
) # Default to -inf for groups without original hits

# Assign the `matched_ions_pct` of original hits to their groups
np.maximum.at(
original_matched_ions_pct, inverse_indices[original_hit], matched_ions[original_hit]
)

# lower the maximum with the threshold
original_matched_ions_pct = original_matched_ions_pct * threshold

# Broadcast the original `matched_ions_pct` back to all PSMs in each group
original_matched_ions_for_all = original_matched_ions_pct[inverse_indices]

# Determine the filtering condition
keep = np.logical_or(
original_hit, # Always keep original hits
matched_ions
>= original_matched_ions_for_all, # Keep hits with `matched_ions_pct` >= the original
)

# Filter PSMs
logger.debug(f"Filtered out {len(psm_list) - np.sum(keep)} mumble PSMs.")
return psm_list[keep]

0 comments on commit ca9da7d

Please sign in to comment.