Skip to content

Commit

Permalink
Major changes in the MS structure to include precursors.
Browse files Browse the repository at this point in the history
  • Loading branch information
ypriverol committed Mar 7, 2025
1 parent d6a4a6b commit 21f43f0
Show file tree
Hide file tree
Showing 8 changed files with 52 additions and 32 deletions.
2 changes: 1 addition & 1 deletion quantmsutils/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.0.19"
__version__ = "0.0.20"
1 change: 1 addition & 0 deletions quantmsutils/diann/dianncfg.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
License: Apache 2.0
Authors: Dai Chengxin, Yasset Perez-Riverol
"""

import logging
import re
from typing import List, Tuple
Expand Down
67 changes: 42 additions & 25 deletions quantmsutils/mzml/mzml_statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
import pyarrow as pa
import pyarrow.parquet as pq
from pyopenms import MzMLFile
import pyopenms as oms

from quantmsutils.utils.constants import (
CHARGE,
Expand All @@ -26,12 +25,14 @@
INTENSITY_ARRAY,
MONOISOTOPIC_MZ,
MAX_INTENSITY,
PRECURSORS, INTENSITY,
PRECURSORS,
INTENSITY,
)

logging.basicConfig(format="%(asctime)s [%(funcName)s] - %(message)s", level=logging.DEBUG)
logger = logging.getLogger(__name__)


class BatchWritingConsumer:
"""
A class to consume mass spectrometry data and write to a parquet file in batches from mzML files using
Expand Down Expand Up @@ -96,7 +97,7 @@ def consumeSpectrum(self, spectrum):
) # Get the maximum intensity
total_intensity = (
float(np.sum(intensity_array)) if peak_per_ms > 0 else None
) # Get the total intensity
) # Get the total intensity # TODO: Review by @timo and @julianus
ms_level = spectrum.getMSLevel() # Get the MS level of the spectrum
rt = spectrum.getRT() # Get the retention time of the spectrum

Expand All @@ -107,29 +108,27 @@ def consumeSpectrum(self, spectrum):
intensity = precursor.getIntensity() # Intensity of first precursor
precursors = []
# capture if more than one precursor
if len(spectrum.getPrecursors()) > 1:
index = 0
for precursor in spectrum.getPrecursors():
logging.info(
"More than one precursor found in spectrum %s", spectrum.getNativeID()
"Precursor charge: %s, precursor mz: %s",
precursor.getCharge(),
precursor.getMZ(),
)
index = 0
for precursor in spectrum.getPrecursors():
logging.info(
"Precursor charge: %s, precursor mz: %s",
precursor.getCharge(),
precursor.getMZ(),
)
charge_state = precursor.getCharge()
exp_mz = precursor.getMZ()
intensity = precursor.getIntensity()
precursors.append(
{
"charge": charge_state,
"mz": exp_mz,
"intensity": intensity,
"index": index,
}
)
index += 1
charge_state = precursor.getCharge() # TODO: Review by @timo and @julianus
exp_mz = precursor.getMZ() # TODO: Review by @timo and @julianus
intensity = precursor.getIntensity() # TODO: Review by @timo and @julianus
rt = spectrum.getRT() # TODO: Review by @timo and @julianus
precursors.append(
{
"charge": charge_state,
"mz": exp_mz,
"intensity": intensity,
"rt": rt,
"index": index,
}
)
index += 1

if self.id_only:
scan_id = self.scan_pattern.findall(spectrum.getNativeID())[0]
Expand All @@ -155,6 +154,7 @@ def consumeSpectrum(self, spectrum):
),
RETENTION_TIME: float(rt),
EXPERIMENTAL_MASS_TO_CHARGE: float(exp_mz) if exp_mz is not None else None,
INTENSITY: float(intensity) if intensity is not None else None,
ACQUISITION_DATETIME: str(self.acquisition_datetime),
PRECURSORS: precursors,
}
Expand Down Expand Up @@ -289,6 +289,22 @@ def mzml_statistics(ctx, ms_path: str, id_only: bool = False, batch_size: int =
pa.field(SUMMED_PEAK_INTENSITY, pa.float64(), nullable=True),
pa.field(RETENTION_TIME, pa.float64(), nullable=True),
pa.field(EXPERIMENTAL_MASS_TO_CHARGE, pa.float64(), nullable=True),
pa.field(INTENSITY, pa.float64(), nullable=True),
pa.field(
PRECURSORS,
pa.list_(
pa.struct(
[
("charge", pa.int32()),
("mz", pa.float64()),
("intensity", pa.float64()),
("rt", pa.float64()),
("index", pa.int32()),
]
)
),
nullable=True,
),
pa.field(ACQUISITION_DATETIME, pa.string(), nullable=True),
]
)
Expand Down Expand Up @@ -377,8 +393,8 @@ def batch_write_bruker_d(file_name: str, output_path: str, batch_size: int = 100
pa.field(SUMMED_PEAK_INTENSITY, pa.float64(), nullable=True),
pa.field(RETENTION_TIME, pa.float64(), nullable=True),
pa.field(CHARGE, pa.int32(), nullable=True),
pa.field(INTENSITY, pa.float64(), nullable=True),
pa.field(MONOISOTOPIC_MZ, pa.float64(), nullable=True),
pa.field(INTENSITY, pa.float64(), nullable=True),
pa.field(
PRECURSORS,
pa.list_(
Expand All @@ -387,6 +403,7 @@ def batch_write_bruker_d(file_name: str, output_path: str, batch_size: int = 100
("charge", pa.int32()),
("mz", pa.float64()),
("intensity", pa.float64()),
("rt", pa.float64()),
("index", pa.int32()),
]
)
Expand Down
1 change: 1 addition & 0 deletions quantmsutils/psm/psm_conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
logging.basicConfig(format="%(asctime)s [%(funcName)s] - %(message)s", level=logging.DEBUG)
logger = logging.getLogger(__name__)


def mods_position(peptide):
if peptide.startswith("."):
peptide = peptide[1:]
Expand Down
1 change: 1 addition & 0 deletions quantmsutils/sdrf/check_samplesheet.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
logging.basicConfig(format="%(asctime)s [%(funcName)s] - %(message)s", level=logging.DEBUG)
logger = logging.getLogger(__name__)


def make_dir(path):
if len(path) > 0:
try:
Expand Down
1 change: 1 addition & 0 deletions quantmsutils/sdrf/extract_sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
logging.basicConfig(format="%(asctime)s [%(funcName)s] - %(message)s", level=logging.DEBUG)
logger = logging.getLogger(__name__)


@click.command(
"openms2sample",
short_help="Extract sample information from an experiment design file",
Expand Down
2 changes: 1 addition & 1 deletion quantmsutils/utils/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
MAX_INTENSITY = "max_intensity"
RETENTION_TIME = "rt"
PRECURSORS = "precursors"
INTENSITY = "intensity"
INTENSITY = "intensity"
EXPERIMENTAL_MASS_TO_CHARGE = "observed_mz"
ACQUISITION_DATETIME = "acquisition_datetime"
MONOISOTOPIC_MZ = "monoisotopic_mz"
Expand Down
9 changes: 4 additions & 5 deletions tests/test_commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

TESTS_DIR = Path(__file__).parent


# test for the create_diann_cfg command in cli
def test_create_diann_cfg_help():
runner = CliRunner()
Expand Down Expand Up @@ -145,18 +146,16 @@ def test_mzml_statistics():
runner = CliRunner()

mzml_path = TESTS_DIR / "test_data" / "BSA1_F1.mzML"
result = runner.invoke(
cli, ["mzmlstats", "--id_only", "--ms_path", mzml_path]
)
result = runner.invoke(cli, ["mzmlstats", "--id_only", "--ms_path", mzml_path])

ms_info_path = TESTS_DIR / "test_data" / "BSA1_F1_ms_info.parquet"
table2 = pd.read_parquet(ms_info_path)
table1 = pd.read_parquet(TESTS_DIR / "test_data" / "BSA1_F1_test_ms_info.parquet")
table2 = pd.read_parquet(ms_info_path)
table2 = table2.set_index("scan")
table1 = table1.set_index("scan")

assert table1.compare(table2).empty
assert len(table2) == len(table1)


id_table = pd.read_parquet("BSA1_F1_spectrum_df.parquet")
id_table2 = pd.read_parquet("tests/test_data/BSA1_F1_spectrum_df.parquet")
Expand Down

0 comments on commit 21f43f0

Please sign in to comment.