Skip to content

Commit

Permalink
feat: simplify mzml reader
Browse files: browse the repository at this point in the history
  • Loading branch information
tshauck committed Apr 22, 2024
1 parent 04be396 commit 4cd790a
Show file tree
Hide file tree
Showing 4 changed files with 74 additions and 222 deletions.
3 changes: 3 additions & 0 deletions exon/exon-core/tests/sqllogictests/slt/mzml-functions.slt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@ control substitution on
statement ok
CREATE EXTERNAL TABLE mzml_table STORED AS MZML LOCATION '$CARGO_MANIFEST_DIR/test-data/datasources/mzml-pyoteomics/pyoteomics.mzML'

statement ok
SELECT * FROM mzml_table LIMIT 1;

query I
SELECT contains_peak(mz.mz, 200.0, 1.0) AS has_peak FROM mzml_table LIMIT 1;
----
Expand Down
241 changes: 63 additions & 178 deletions exon/exon-mzml/src/array_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ use std::sync::Arc;
use arrow::{
array::{
ArrayBuilder, ArrayRef, Float64Builder, GenericListBuilder, GenericStringBuilder,
ListBuilder, MapBuilder, StructBuilder,
Int64Builder, ListBuilder, StructBuilder,
},
datatypes::{DataType, Field, Fields},
};
Expand All @@ -28,6 +28,8 @@ use crate::mzml_reader::{
NO_COMPRESSION_MS_NUMBER, WAVE_LENGTH_ARRAY, ZLIB_COMPRESSION_MS_NUMBER,
};

// https://github.com/wfondrie/depthcharge/blob/d46adf12deba06fb5d1019eb6e7a2ff621bfb388/depthcharge/data/parsers.py#L253

use super::mzml_reader::binary_conversion::decode_binary_array;

pub struct MzMLArrayBuilder {
Expand All @@ -39,9 +41,11 @@ pub struct MzMLArrayBuilder {
intensity: StructBuilder,
wavelength: StructBuilder,

cv_params: MapBuilder<GenericStringBuilder<i32>, StructBuilder>,
// cv_params: MapBuilder<GenericStringBuilder<i32>, StructBuilder>,
cv_params: GenericListBuilder<i32, StructBuilder>,

precursor_list: GenericListBuilder<i32, StructBuilder>,
precursor_mz: Float64Builder,
precursor_charge: Int64Builder,
}

impl MzMLArrayBuilder {
Expand Down Expand Up @@ -79,105 +83,32 @@ impl MzMLArrayBuilder {
let wavelength_builder =
StructBuilder::new(wavelength_fields, vec![Box::new(wavelength_array_builder)]);

let cv_params_builder = MapBuilder::new(
None,
GenericStringBuilder::<i32>::new(),
StructBuilder::new(
Fields::from(vec![
Field::new("accession", DataType::Utf8, true),
Field::new("name", DataType::Utf8, true),
Field::new("value", DataType::Utf8, true),
]),
vec![
Box::new(GenericStringBuilder::<i32>::new()),
Box::new(GenericStringBuilder::<i32>::new()),
Box::new(GenericStringBuilder::<i32>::new()),
],
),
);

let cv_param_struct = Field::new(
"values",
DataType::Struct(Fields::from(vec![
let cv_param_builder = StructBuilder::new(
Fields::from(vec![
Field::new("accession", DataType::Utf8, true),
Field::new("name", DataType::Utf8, true),
Field::new("value", DataType::Utf8, true),
])),
true,
);

let cv_key_field = Field::new("keys", DataType::Utf8, false);

// A map of cvParams to their values (DataType::Utf8 to cvParamStruct)
let isolation_window = Field::new_map(
"isolation_window",
"entries",
cv_key_field.clone(),
cv_param_struct.clone(),
false,
true,
);

let activation = Field::new_map(
"activation",
"entries",
cv_key_field,
cv_param_struct,
false,
true,
]),
vec![
Box::new(GenericStringBuilder::<i32>::new()),
Box::new(GenericStringBuilder::<i32>::new()),
Box::new(GenericStringBuilder::<i32>::new()),
],
);

let precursor_fields = Fields::from(vec![isolation_window, activation]);

let isolation_window_builder = MapBuilder::new(
None,
GenericStringBuilder::<i32>::new(),
StructBuilder::new(
Fields::from(vec![
Field::new("accession", DataType::Utf8, true),
Field::new("name", DataType::Utf8, true),
Field::new("value", DataType::Utf8, true),
]),
vec![
Box::new(GenericStringBuilder::<i32>::new()),
Box::new(GenericStringBuilder::<i32>::new()),
Box::new(GenericStringBuilder::<i32>::new()),
],
),
);
let cv_params_builder = GenericListBuilder::new(cv_param_builder);

let activation = MapBuilder::new(
None,
GenericStringBuilder::<i32>::new(),
StructBuilder::new(
Fields::from(vec![
Field::new("accession", DataType::Utf8, true),
Field::new("name", DataType::Utf8, true),
Field::new("value", DataType::Utf8, true),
]),
vec![
Box::new(GenericStringBuilder::<i32>::new()),
Box::new(GenericStringBuilder::<i32>::new()),
Box::new(GenericStringBuilder::<i32>::new()),
],
),
);

let precursor_builder = StructBuilder::new(
precursor_fields,
vec![Box::new(isolation_window_builder), Box::new(activation)],
);
let precursor_mz = Float64Builder::new();

Self {
id: GenericStringBuilder::<i32>::new(),

mz: mz_builder,
intensity: intensity_builder,
wavelength: wavelength_builder,

cv_params: cv_params_builder,

precursor_list: GenericListBuilder::new(precursor_builder),
precursor_mz,
precursor_charge: Int64Builder::new(),
}
}

Expand Down Expand Up @@ -400,8 +331,6 @@ impl MzMLArrayBuilder {
self.id.append_value(&record.id);

for cv_param in &record.cv_param {
self.cv_params.keys().append_value(&cv_param.accession);

self.cv_params
.values()
.field_builder::<GenericStringBuilder<i32>>(0)
Expand All @@ -428,104 +357,58 @@ impl MzMLArrayBuilder {

self.cv_params.values().append(true);
}
self.cv_params.append(true).unwrap();
self.cv_params.append(true);

self.append_data_arrays(record)?;

let precursor_list_values = self.precursor_list.values();

match &record.precursor_list {
Some(precursor_list) => {
for precursor in &precursor_list.precursor {
let isolation_window_builder = precursor_list_values
.field_builder::<MapBuilder<GenericStringBuilder<i32>, StructBuilder>>(0)
.unwrap();

match &precursor.isolation_window {
Some(isolation_window) => {
for cv_param in &isolation_window.cv_param {
isolation_window_builder
.keys()
.append_value(&cv_param.accession);

isolation_window_builder
.values()
.field_builder::<GenericStringBuilder<i32>>(0)
.unwrap()
.append_value(&cv_param.accession);

isolation_window_builder
.values()
.field_builder::<GenericStringBuilder<i32>>(1)
.unwrap()
.append_value(&cv_param.name);

let cv_value = cv_param.value.as_ref().map(|v| v.to_string());
isolation_window_builder
.values()
.field_builder::<GenericStringBuilder<i32>>(2)
.unwrap()
.append_option(cv_value);

isolation_window_builder.values().append(true);
}
isolation_window_builder.append(true).unwrap();
}
None => {
isolation_window_builder.append(false).unwrap();
let precursor = &precursor_list.precursor[0];
let selected_ion = &precursor.selected_ion_list.selected_ion[0];

let selected_ion_mz = selected_ion.cv_param.iter().find_map(|f| {
if f.accession == "MS:1000744" {
if let Some(value) = &f.value {
let string_value = value.to_string();
let float_value = string_value.parse::<f64>().unwrap();

Some(float_value)
} else {
None
}
};

let activation_builder = precursor_list_values
.field_builder::<MapBuilder<GenericStringBuilder<i32>, StructBuilder>>(1)
.unwrap();

for cv_param in &precursor.activation.cv_param {
activation_builder.keys().append_value(&cv_param.accession);

activation_builder
.values()
.field_builder::<GenericStringBuilder<i32>>(0)
.unwrap()
.append_value(&cv_param.accession);

activation_builder
.values()
.field_builder::<GenericStringBuilder<i32>>(1)
.unwrap()
.append_value(&cv_param.name);

activation_builder
.values()
.field_builder::<GenericStringBuilder<i32>>(2)
.unwrap()
.append_null();

activation_builder.values().append(true);
} else {
None
}
});

activation_builder.append(true).unwrap();
if let Some(selected_ion_mz) = selected_ion_mz {
self.precursor_mz.append_value(selected_ion_mz);
} else {
self.precursor_mz.append_null();
}

precursor_list_values.append(true);
self.precursor_list.append(true);
let charge_state = selected_ion.cv_param.iter().find_map(|f| {
if f.accession == "MS:1000041" {
if let Some(value) = &f.value {
Some(value)
} else {
None
}
} else {
None
}
});

if let Some(charge_state) = charge_state {
let charge_state = charge_state.parse::<i64>().unwrap();
self.precursor_charge.append_value(charge_state);
} else {
self.precursor_charge.append_null();
}
}
None => {
let isolation_window_builder = precursor_list_values
.field_builder::<MapBuilder<GenericStringBuilder<i32>, StructBuilder>>(0)
.unwrap();

isolation_window_builder.append(true).unwrap();

let activation_builder = precursor_list_values
.field_builder::<MapBuilder<GenericStringBuilder<i32>, StructBuilder>>(1)
.unwrap();

activation_builder.append(true).unwrap();

precursor_list_values.append_null();

self.precursor_list.append_null();
self.precursor_mz.append_null();
self.precursor_charge.append_null();
}
}

Expand All @@ -540,15 +423,17 @@ impl MzMLArrayBuilder {

let cv_params = self.cv_params.finish();

let precursor_list = self.precursor_list.finish();
let precursor_mz = self.precursor_mz.finish();
let precursor_charge = self.precursor_charge.finish();

vec![
Arc::new(id),
Arc::new(mz),
Arc::new(intensity),
Arc::new(wavelength),
Arc::new(cv_params),
Arc::new(precursor_list),
Arc::new(precursor_mz),
Arc::new(precursor_charge),
]
}
}
1 change: 1 addition & 0 deletions exon/exon-mzml/src/batch_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ where

let batch: RecordBatch =
RecordBatch::try_new(self.config.file_schema.clone(), array_builder.finish())?;

match &self.config.projection {
Some(projection) => {
let projected_batch = batch.project(projection)?;
Expand Down
Loading

0 comments on commit 4cd790a

Please sign in to comment.