diff --git a/exon/exon-core/tests/sqllogictests/slt/mzml-functions.slt b/exon/exon-core/tests/sqllogictests/slt/mzml-functions.slt index 708cc524..8f78a690 100644 --- a/exon/exon-core/tests/sqllogictests/slt/mzml-functions.slt +++ b/exon/exon-core/tests/sqllogictests/slt/mzml-functions.slt @@ -3,6 +3,9 @@ control substitution on statement ok CREATE EXTERNAL TABLE mzml_table STORED AS MZML LOCATION '$CARGO_MANIFEST_DIR/test-data/datasources/mzml-pyoteomics/pyoteomics.mzML' +statement ok +SELECT * FROM mzml_table LIMIT 1; + query I SELECT contains_peak(mz.mz, 200.0, 1.0) AS has_peak FROM mzml_table LIMIT 1; ---- diff --git a/exon/exon-mzml/src/array_builder.rs b/exon/exon-mzml/src/array_builder.rs index e5d1ee01..3ea96437 100644 --- a/exon/exon-mzml/src/array_builder.rs +++ b/exon/exon-mzml/src/array_builder.rs @@ -17,7 +17,7 @@ use std::sync::Arc; use arrow::{ array::{ ArrayBuilder, ArrayRef, Float64Builder, GenericListBuilder, GenericStringBuilder, - ListBuilder, MapBuilder, StructBuilder, + Int64Builder, ListBuilder, StructBuilder, }, datatypes::{DataType, Field, Fields}, }; @@ -28,6 +28,8 @@ use crate::mzml_reader::{ NO_COMPRESSION_MS_NUMBER, WAVE_LENGTH_ARRAY, ZLIB_COMPRESSION_MS_NUMBER, }; +// https://github.com/wfondrie/depthcharge/blob/d46adf12deba06fb5d1019eb6e7a2ff621bfb388/depthcharge/data/parsers.py#L253 + use super::mzml_reader::binary_conversion::decode_binary_array; pub struct MzMLArrayBuilder { @@ -39,9 +41,11 @@ pub struct MzMLArrayBuilder { intensity: StructBuilder, wavelength: StructBuilder, - cv_params: MapBuilder, StructBuilder>, + // cv_params: MapBuilder, StructBuilder>, + cv_params: GenericListBuilder, - precursor_list: GenericListBuilder, + precursor_mz: Float64Builder, + precursor_charge: Int64Builder, } impl MzMLArrayBuilder { @@ -79,94 +83,22 @@ impl MzMLArrayBuilder { let wavelength_builder = StructBuilder::new(wavelength_fields, vec![Box::new(wavelength_array_builder)]); - let cv_params_builder = MapBuilder::new( - None, - 
GenericStringBuilder::::new(), - StructBuilder::new( - Fields::from(vec![ - Field::new("accession", DataType::Utf8, true), - Field::new("name", DataType::Utf8, true), - Field::new("value", DataType::Utf8, true), - ]), - vec![ - Box::new(GenericStringBuilder::::new()), - Box::new(GenericStringBuilder::::new()), - Box::new(GenericStringBuilder::::new()), - ], - ), - ); - - let cv_param_struct = Field::new( - "values", - DataType::Struct(Fields::from(vec![ + let cv_param_builder = StructBuilder::new( + Fields::from(vec![ Field::new("accession", DataType::Utf8, true), Field::new("name", DataType::Utf8, true), Field::new("value", DataType::Utf8, true), - ])), - true, - ); - - let cv_key_field = Field::new("keys", DataType::Utf8, false); - - // A map of cvParams to their values (DataType::Utf8 to cvParamStruct) - let isolation_window = Field::new_map( - "isolation_window", - "entries", - cv_key_field.clone(), - cv_param_struct.clone(), - false, - true, - ); - - let activation = Field::new_map( - "activation", - "entries", - cv_key_field, - cv_param_struct, - false, - true, + ]), + vec![ + Box::new(GenericStringBuilder::::new()), + Box::new(GenericStringBuilder::::new()), + Box::new(GenericStringBuilder::::new()), + ], ); - let precursor_fields = Fields::from(vec![isolation_window, activation]); - - let isolation_window_builder = MapBuilder::new( - None, - GenericStringBuilder::::new(), - StructBuilder::new( - Fields::from(vec![ - Field::new("accession", DataType::Utf8, true), - Field::new("name", DataType::Utf8, true), - Field::new("value", DataType::Utf8, true), - ]), - vec![ - Box::new(GenericStringBuilder::::new()), - Box::new(GenericStringBuilder::::new()), - Box::new(GenericStringBuilder::::new()), - ], - ), - ); + let cv_params_builder = GenericListBuilder::new(cv_param_builder); - let activation = MapBuilder::new( - None, - GenericStringBuilder::::new(), - StructBuilder::new( - Fields::from(vec![ - Field::new("accession", DataType::Utf8, true), - 
Field::new("name", DataType::Utf8, true), - Field::new("value", DataType::Utf8, true), - ]), - vec![ - Box::new(GenericStringBuilder::::new()), - Box::new(GenericStringBuilder::::new()), - Box::new(GenericStringBuilder::::new()), - ], - ), - ); - - let precursor_builder = StructBuilder::new( - precursor_fields, - vec![Box::new(isolation_window_builder), Box::new(activation)], - ); + let precursor_mz = Float64Builder::new(); Self { id: GenericStringBuilder::::new(), @@ -174,10 +106,9 @@ impl MzMLArrayBuilder { mz: mz_builder, intensity: intensity_builder, wavelength: wavelength_builder, - cv_params: cv_params_builder, - - precursor_list: GenericListBuilder::new(precursor_builder), + precursor_mz, + precursor_charge: Int64Builder::new(), } } @@ -400,8 +331,6 @@ impl MzMLArrayBuilder { self.id.append_value(&record.id); for cv_param in &record.cv_param { - self.cv_params.keys().append_value(&cv_param.accession); - self.cv_params .values() .field_builder::>(0) @@ -428,104 +357,58 @@ impl MzMLArrayBuilder { self.cv_params.values().append(true); } - self.cv_params.append(true).unwrap(); + self.cv_params.append(true); self.append_data_arrays(record)?; - let precursor_list_values = self.precursor_list.values(); - match &record.precursor_list { Some(precursor_list) => { - for precursor in &precursor_list.precursor { - let isolation_window_builder = precursor_list_values - .field_builder::, StructBuilder>>(0) - .unwrap(); - - match &precursor.isolation_window { - Some(isolation_window) => { - for cv_param in &isolation_window.cv_param { - isolation_window_builder - .keys() - .append_value(&cv_param.accession); - - isolation_window_builder - .values() - .field_builder::>(0) - .unwrap() - .append_value(&cv_param.accession); - - isolation_window_builder - .values() - .field_builder::>(1) - .unwrap() - .append_value(&cv_param.name); - - let cv_value = cv_param.value.as_ref().map(|v| v.to_string()); - isolation_window_builder - .values() - .field_builder::>(2) - .unwrap() - 
.append_option(cv_value); - - isolation_window_builder.values().append(true); - } - isolation_window_builder.append(true).unwrap(); - } - None => { - isolation_window_builder.append(false).unwrap(); + let precursor = &precursor_list.precursor[0]; + let selected_ion = &precursor.selected_ion_list.selected_ion[0]; + + let selected_ion_mz = selected_ion.cv_param.iter().find_map(|f| { + if f.accession == "MS:1000744" { + if let Some(value) = &f.value { + let string_value = value.to_string(); + let float_value = string_value.parse::().unwrap(); + + Some(float_value) + } else { + None } - }; - - let activation_builder = precursor_list_values - .field_builder::, StructBuilder>>(1) - .unwrap(); - - for cv_param in &precursor.activation.cv_param { - activation_builder.keys().append_value(&cv_param.accession); - - activation_builder - .values() - .field_builder::>(0) - .unwrap() - .append_value(&cv_param.accession); - - activation_builder - .values() - .field_builder::>(1) - .unwrap() - .append_value(&cv_param.name); - - activation_builder - .values() - .field_builder::>(2) - .unwrap() - .append_null(); - - activation_builder.values().append(true); + } else { + None } + }); - activation_builder.append(true).unwrap(); + if let Some(selected_ion_mz) = selected_ion_mz { + self.precursor_mz.append_value(selected_ion_mz); + } else { + self.precursor_mz.append_null(); } - precursor_list_values.append(true); - self.precursor_list.append(true); + let charge_state = selected_ion.cv_param.iter().find_map(|f| { + if f.accession == "MS:1000041" { + if let Some(value) = &f.value { + Some(value) + } else { + None + } + } else { + None + } + }); + + if let Some(charge_state) = charge_state { + let charge_state = charge_state.parse::().unwrap(); + self.precursor_charge.append_value(charge_state); + } else { + self.precursor_charge.append_null(); + } } None => { - let isolation_window_builder = precursor_list_values - .field_builder::, StructBuilder>>(0) - .unwrap(); - - 
isolation_window_builder.append(true).unwrap(); - - let activation_builder = precursor_list_values - .field_builder::, StructBuilder>>(1) - .unwrap(); - - activation_builder.append(true).unwrap(); - - precursor_list_values.append_null(); - - self.precursor_list.append_null(); + self.precursor_mz.append_null(); + self.precursor_charge.append_null(); } } @@ -540,7 +423,8 @@ impl MzMLArrayBuilder { let cv_params = self.cv_params.finish(); - let precursor_list = self.precursor_list.finish(); + let precursor_mz = self.precursor_mz.finish(); + let precursor_charge = self.precursor_charge.finish(); vec![ Arc::new(id), @@ -548,7 +432,8 @@ impl MzMLArrayBuilder { Arc::new(intensity), Arc::new(wavelength), Arc::new(cv_params), - Arc::new(precursor_list), + Arc::new(precursor_mz), + Arc::new(precursor_charge), ] } } diff --git a/exon/exon-mzml/src/batch_reader.rs b/exon/exon-mzml/src/batch_reader.rs index 9a0beb40..302a2814 100644 --- a/exon/exon-mzml/src/batch_reader.rs +++ b/exon/exon-mzml/src/batch_reader.rs @@ -71,6 +71,7 @@ where let batch: RecordBatch = RecordBatch::try_new(self.config.file_schema.clone(), array_builder.finish())?; + match &self.config.projection { Some(projection) => { let projected_batch = batch.project(projection)?; diff --git a/exon/exon-mzml/src/config.rs b/exon/exon-mzml/src/config.rs index 12234f8a..f4ee308b 100644 --- a/exon/exon-mzml/src/config.rs +++ b/exon/exon-mzml/src/config.rs @@ -125,60 +125,23 @@ fn file_fields() -> Vec { true, ); - let cv_params_field = Field::new_map( + let cv_params_field = Field::new( "cv_params", - "entries", - Field::new("keys", DataType::Utf8, false), - Field::new( - "values", - DataType::Struct(Fields::from(vec![ - Field::new("accession", DataType::Utf8, true), - Field::new("name", DataType::Utf8, true), - Field::new("value", DataType::Utf8, true), - ])), + DataType::List(Arc::new(Field::new( + "item", + cv_param_struct.data_type().clone(), true, - ), - false, + ))), true, ); - let cv_key_field = 
Field::new("keys", DataType::Utf8, false); - - // A map of cvParams to their values (DataType::Utf8 to cvParamStruct) - let isolation_window = Field::new_map( - "isolation_window", - "entries", - cv_key_field.clone(), - cv_param_struct.clone(), - false, - true, - ); - - let activation = Field::new_map( - "activation", - "entries", - cv_key_field, - cv_param_struct, - false, - true, - ); - - // A precursor is a struct - let precursor = Field::new( - "item", - DataType::Struct(Fields::from(vec![isolation_window, activation])), - true, - ); - - // A precursor list is a list of precursors - let precursor_list = Field::new("precursor_list", DataType::List(Arc::new(precursor)), true); - vec![ Field::new("id", DataType::Utf8, false), mz_field, intensity_field, wavelength_field, cv_params_field, - precursor_list, + Field::new("precursor_mz", DataType::Float64, true), + Field::new("precursor_charge", DataType::Int64, true), ] }