From e25f47196d454cf0fc15b75b76609ee8d207f8b1 Mon Sep 17 00:00:00 2001 From: Jack Huey <31162821+jackh726@users.noreply.github.com> Date: Thu, 22 Feb 2024 21:18:21 -0500 Subject: [PATCH 01/30] Extract out a BBIFile trait for pybigtools and use it in start_end --- pybigtools/src/lib.rs | 46 ++++++++++++++++++++++++++++++++----------- 1 file changed, 34 insertions(+), 12 deletions(-) diff --git a/pybigtools/src/lib.rs b/pybigtools/src/lib.rs index 75386b8..ec66738 100644 --- a/pybigtools/src/lib.rs +++ b/pybigtools/src/lib.rs @@ -13,10 +13,10 @@ use bigtools::utils::misc::{ bigwig_average_over_bed, BigWigAverageOverBedEntry, BigWigAverageOverBedError, Name, }; use bigtools::{ - BBIRead, BedEntry, BigBedRead as BigBedReadRaw, BigBedWrite as BigBedWriteRaw, + BBIReadError, BedEntry, BigBedRead as BigBedReadRaw, BigBedWrite as BigBedWriteRaw, BigWigRead as BigWigReadRaw, BigWigWrite as BigWigWriteRaw, CachedBBIFileRead, Value, + ZoomRecord, }; -use bigtools::{BBIReadError, ZoomRecord}; use bigtools::utils::reopen::Reopen; use file_like::PyFileLikeObject; @@ -34,13 +34,13 @@ mod file_like; type ValueTuple = (u32, u32, f32); type BedEntryTuple = (u32, u32, String); -fn start_end( - bigwig: &B, +fn start_end( + bbi: &B, chrom_name: &str, start: Option, end: Option, ) -> PyResult<(u32, u32)> { - let chrom = bigwig.chroms().into_iter().find(|x| x.name == chrom_name); + let chrom = bbi.chroms().into_iter().find(|x| x.name == chrom_name); let length = match chrom { None => { return Err(PyErr::new::(format!( @@ -59,6 +59,32 @@ impl Reopen for PyFileLikeObject { } } +trait BBIFile { + fn chroms(&self) -> &[bigtools::ChromInfo]; +} + +impl BBIFile for BigWigRead { + fn chroms(&self) -> &[bigtools::ChromInfo] { + match &self.bigwig { + BigWigRaw::File(b) => b.chroms(), + #[cfg(feature = "remote")] + BigWigRaw::Remote(b) => b.chroms(), + BigWigRaw::FileLike(b) => b.chroms(), + } + } +} + +impl BBIFile for BigBedRead { + fn chroms(&self) -> &[bigtools::ChromInfo] { + match 
&self.bigbed { + BigBedRaw::File(b) => b.chroms(), + #[cfg(feature = "remote")] + BigBedRaw::Remote(b) => b.chroms(), + BigBedRaw::FileLike(b) => b.chroms(), + } + } +} + enum BigWigRaw { File(BigWigReadRaw>), #[cfg(feature = "remote")] @@ -89,9 +115,9 @@ impl BigWigRead { start: Option, end: Option, ) -> PyResult { + let (start, end) = start_end(self, &chrom, start, end)?; match &self.bigwig { BigWigRaw::File(b) => { - let (start, end) = start_end(b, &chrom, start, end)?; let b = b.reopen()?; Ok(IntervalIterator { iter: Box::new(b.get_interval_move(&chrom, start, end).unwrap()), @@ -99,14 +125,12 @@ impl BigWigRead { } #[cfg(feature = "remote")] BigWigRaw::Remote(b) => { - let (start, end) = start_end(b, &chrom, start, end)?; let b = b.reopen()?; Ok(IntervalIterator { iter: Box::new(b.get_interval_move(&chrom, start, end).unwrap()), }) } BigWigRaw::FileLike(b) => { - let (start, end) = start_end(b, &chrom, start, end)?; let b = b.reopen()?; Ok(IntervalIterator { iter: Box::new(b.get_interval_move(&chrom, start, end).unwrap()), @@ -272,9 +296,9 @@ impl BigWigRead { ))); } }; + let (start, end) = start_end(self, &chrom, start, end)?; macro_rules! 
to_array { ($b:ident) => {{ - let (start, end) = start_end($b, &chrom, start, end)?; match bins { Some(bins) if !exact.unwrap_or(false) => { let max_zoom_size = ((end - start) as f32 / (bins * 2) as f32) as u32; @@ -630,9 +654,9 @@ impl BigBedRead { start: Option, end: Option, ) -> PyResult { + let (start, end) = start_end(self, &chrom, start, end)?; match &mut self.bigbed { BigBedRaw::File(b) => { - let (start, end) = start_end(b, &chrom, start, end)?; let b = b.reopen()?; Ok(EntriesIterator { iter: Box::new(b.get_interval_move(&chrom, start, end).unwrap()), @@ -640,14 +664,12 @@ impl BigBedRead { } #[cfg(feature = "remote")] BigBedRaw::Remote(b) => { - let (start, end) = start_end(b, &chrom, start, end)?; let b = b.reopen()?; Ok(EntriesIterator { iter: Box::new(b.get_interval_move(&chrom, start, end).unwrap()), }) } BigBedRaw::FileLike(b) => { - let (start, end) = start_end(b, &chrom, start, end)?; let b = b.reopen()?; Ok(EntriesIterator { iter: Box::new(b.get_interval_move(&chrom, start, end).unwrap()), From d881b5e34abe27d36291668b8d6b0843fafa70d5 Mon Sep 17 00:00:00 2001 From: Jack Huey <31162821+jackh726@users.noreply.github.com> Date: Sun, 3 Mar 2024 23:31:14 -0500 Subject: [PATCH 02/30] Merge BigWigRead and BigBedRead into BBIRead, add sql fn --- bigtools/src/bed/autosql.rs | 158 ++++--- pybigtools/src/lib.rs | 793 ++++++++++++++++++++---------------- 2 files changed, 553 insertions(+), 398 deletions(-) diff --git a/bigtools/src/bed/autosql.rs b/bigtools/src/bed/autosql.rs index 922c7f8..b5f33f8 100644 --- a/bigtools/src/bed/autosql.rs +++ b/bigtools/src/bed/autosql.rs @@ -356,6 +356,62 @@ pub mod parse { pub auto: bool, } + impl DeclareName { + fn parse(parser: &mut parser::Parser<'_>) -> Result { + let declare_name = parser.eat_word(); + if !declare_name.chars().next().unwrap_or(' ').is_alphabetic() + || declare_name.chars().any(|c| !c.is_alphanumeric()) + { + return Err(ParseError::InvalidDeclareName(declare_name.to_string())); + } + let declare_name = 
declare_name.to_string(); + + let next_word = parser.peek_word(); + let index_type = match next_word { + "primary" => { + parser.eat_word(); + Some(IndexType::Primary) + } + "index" => { + parser.eat_word(); + + let next = parser.peek_one(); + let size = if next == "[" { + parser.eat_one(); + let size = parser.eat_word().to_string(); + let close = parser.eat_one(); + if close != "]" { + return Err(ParseError::InvalidIndexSizeBrackets(close.to_string())); + } + Some(size) + } else { + None + }; + Some(IndexType::Index(size)) + } + "unique" => { + parser.eat_word(); + Some(IndexType::Unique) + } + "auto" => None, + _ => None, + }; + + let next_word = parser.peek_word(); + let auto = if next_word == "auto" { + parser.eat_word(); + true + } else { + false + }; + Ok(DeclareName { + name: declare_name, + index_type, + auto, + }) + } + } + #[derive(Clone, Debug)] pub struct Declaration { pub declaration_type: DeclarationType, @@ -445,7 +501,7 @@ pub mod parse { } "simple" => { parser.take(); - let declare_name = parse_declare_name(parser)?; + let declare_name = DeclareName::parse(parser)?; return Ok(Some(FieldType::Declaration( DeclarationType::Simple, declare_name, @@ -453,7 +509,7 @@ pub mod parse { } "object" => { parser.take(); - let declare_name = parse_declare_name(parser)?; + let declare_name = DeclareName::parse(parser)?; return Ok(Some(FieldType::Declaration( DeclarationType::Object, declare_name, @@ -461,7 +517,7 @@ pub mod parse { } "table" => { parser.take(); - let declare_name = parse_declare_name(parser)?; + let declare_name = DeclareName::parse(parser)?; return Ok(Some(FieldType::Declaration( DeclarationType::Object, declare_name, @@ -472,6 +528,46 @@ pub mod parse { parser.take(); return Ok(Some(field_type)); } + + /// This is currently incomplete when printing `simple`, `table`, and `object`. 
+ pub fn to_string(&self) -> String { + match self { + FieldType::Int => "int".to_string(), + FieldType::Uint => "uint".to_string(), + FieldType::Short => "short".to_string(), + FieldType::Ushort => "ushort".to_string(), + FieldType::Byte => "byte".to_string(), + FieldType::Ubyte => "ubyte".to_string(), + FieldType::Float => "float".to_string(), + FieldType::Double => "double".to_string(), + FieldType::Char => "char".to_string(), + FieldType::String => "string".to_string(), + FieldType::Lstring => "lstring".to_string(), + FieldType::Bigint => "bigint".to_string(), + FieldType::Enum(values) => { + let mut e = "enum(".to_string(); + for v in values { + e.push_str(v) + } + e.push(')'); + e + } + FieldType::Set(values) => { + let mut e = "set(".to_string(); + for v in values { + e.push_str(v) + } + e.push(')'); + e + } + FieldType::Declaration(decl_type, _decl_name) => match decl_type { + DeclarationType::Simple => "simple ...", + DeclarationType::Object => "object ...", + DeclarationType::Table => "table ...", + } + .to_string(), + } + } } #[derive(Clone, Debug)] @@ -523,7 +619,7 @@ pub mod parse { _ => return Err(ParseError::InvalidDeclareType(declare_type.to_string())), }; - let declare_name = parse_declare_name(parser)?; + let declare_name = DeclareName::parse(parser)?; let comment = parser.eat_quoted_string().to_string(); @@ -553,60 +649,6 @@ pub mod parse { })) } - fn parse_declare_name(parser: &mut parser::Parser<'_>) -> Result { - let declare_name = parser.eat_word(); - if !declare_name.chars().next().unwrap_or(' ').is_alphabetic() - || declare_name.chars().any(|c| !c.is_alphanumeric()) - { - return Err(ParseError::InvalidDeclareName(declare_name.to_string())); - } - let declare_name = declare_name.to_string(); - - let next_word = parser.peek_word(); - let index_type = match next_word { - "primary" => { - parser.eat_word(); - Some(IndexType::Primary) - } - "index" => { - parser.eat_word(); - - let next = parser.peek_one(); - let size = if next == "[" { - 
parser.eat_one(); - let size = parser.eat_word().to_string(); - let close = parser.eat_one(); - if close != "]" { - return Err(ParseError::InvalidIndexSizeBrackets(close.to_string())); - } - Some(size) - } else { - None - }; - Some(IndexType::Index(size)) - } - "unique" => { - parser.eat_word(); - Some(IndexType::Unique) - } - "auto" => None, - _ => None, - }; - - let next_word = parser.peek_word(); - let auto = if next_word == "auto" { - parser.eat_word(); - true - } else { - false - }; - Ok(DeclareName { - name: declare_name, - index_type, - auto, - }) - } - fn parse_field_list(parser: &mut parser::Parser<'_>) -> Result, ParseError> { let mut fields = vec![]; loop { diff --git a/pybigtools/src/lib.rs b/pybigtools/src/lib.rs index ec66738..08f0d8b 100644 --- a/pybigtools/src/lib.rs +++ b/pybigtools/src/lib.rs @@ -4,6 +4,7 @@ use std::fs::File; use std::io::{self, BufReader}; use std::path::Path; +use bigtools::bed::autosql::parse::parse_autosql; use bigtools::bed::bedparser::BedParser; use bigtools::bedchromdata::BedParserStreamingIterator; #[cfg(feature = "remote")] @@ -13,9 +14,9 @@ use bigtools::utils::misc::{ bigwig_average_over_bed, BigWigAverageOverBedEntry, BigWigAverageOverBedError, Name, }; use bigtools::{ - BBIReadError, BedEntry, BigBedRead as BigBedReadRaw, BigBedWrite as BigBedWriteRaw, - BigWigRead as BigWigReadRaw, BigWigWrite as BigWigWriteRaw, CachedBBIFileRead, Value, - ZoomRecord, + BBIFileRead, BBIReadError, BedEntry, BigBedRead as BigBedReadRaw, + BigBedWrite as BigBedWriteRaw, BigWigRead as BigWigReadRaw, BigWigWrite as BigWigWriteRaw, + CachedBBIFileRead, Value, ZoomRecord, }; use bigtools::utils::reopen::Reopen; @@ -34,13 +35,23 @@ mod file_like; type ValueTuple = (u32, u32, f32); type BedEntryTuple = (u32, u32, String); -fn start_end( - bbi: &B, +fn start_end( + bbi: &BBIReadRaw, chrom_name: &str, start: Option, end: Option, ) -> PyResult<(u32, u32)> { - let chrom = bbi.chroms().into_iter().find(|x| x.name == chrom_name); + let chroms = 
match &bbi { + BBIReadRaw::BigWigFile(b) => b.chroms(), + #[cfg(feature = "remote")] + BBIReadRaw::BigWigRemote(b) => b.chroms(), + BBIReadRaw::BigWigFileLike(b) => b.chroms(), + BBIReadRaw::BigBedFile(b) => b.chroms(), + #[cfg(feature = "remote")] + BBIReadRaw::BigBedRemote(b) => b.chroms(), + BBIReadRaw::BigBedFileLike(b) => b.chroms(), + }; + let chrom = chroms.into_iter().find(|x| x.name == chrom_name); let length = match chrom { None => { return Err(PyErr::new::(format!( @@ -59,88 +70,175 @@ impl Reopen for PyFileLikeObject { } } -trait BBIFile { - fn chroms(&self) -> &[bigtools::ChromInfo]; -} - -impl BBIFile for BigWigRead { - fn chroms(&self) -> &[bigtools::ChromInfo] { - match &self.bigwig { - BigWigRaw::File(b) => b.chroms(), - #[cfg(feature = "remote")] - BigWigRaw::Remote(b) => b.chroms(), - BigWigRaw::FileLike(b) => b.chroms(), - } - } -} - -impl BBIFile for BigBedRead { - fn chroms(&self) -> &[bigtools::ChromInfo] { - match &self.bigbed { - BigBedRaw::File(b) => b.chroms(), - #[cfg(feature = "remote")] - BigBedRaw::Remote(b) => b.chroms(), - BigBedRaw::FileLike(b) => b.chroms(), - } - } -} - -enum BigWigRaw { - File(BigWigReadRaw>), +enum BBIReadRaw { + BigWigFile(BigWigReadRaw>), + #[cfg(feature = "remote")] + BigWigRemote(BigWigReadRaw>), + BigWigFileLike(BigWigReadRaw>), + BigBedFile(BigBedReadRaw>), #[cfg(feature = "remote")] - Remote(BigWigReadRaw>), - FileLike(BigWigReadRaw>), + BigBedRemote(BigBedReadRaw>), + BigBedFileLike(BigBedReadRaw>), } -/// This class is the interface for reading a bigWig. +/// This class is the interface for reading a bigWig or bigBed. #[pyclass(module = "pybigtools")] -struct BigWigRead { - bigwig: BigWigRaw, +struct BBIRead { + bbi: BBIReadRaw, } #[pymethods] -impl BigWigRead { - /// Returns the intervals of a given range on a chromosome. The result is an iterator of (int, int, float) in the format (start, end, value). - /// The intervals may not be contiguous if the values in the bigwig are not. 
+impl BBIRead { + /// Returns the autosql of this bbi file. /// - /// The chrom argument is the name of the chromosome. - /// The start and end arguments denote the range to get values for. - /// If end is not provided, it defaults to the length of the chromosome. - /// If start is not provided, it defaults to the beginning of the chromosome. + /// For bigBeds, this comes directly from the autosql stored in the file. + /// For bigWigs, the autosql returned matches that of a bedGraph file. + /// + /// By default, the autosql is returned as a string. Passing `parse = true` + /// returns instead a dictionary of the format: + /// ``` + /// { + /// "name": , + /// "comment": , + /// "fields": [(, , ), ...], + /// } + /// ``` + fn sql(&mut self, py: Python, parse: Option) -> PyResult { + let parse = parse.unwrap_or(false); + pub const BEDGRAPH: &str = r#" + table bedGraph + "bedGraph file" + ( + string chrom; "Reference sequence chromosome or scaffold" + uint chromStart; "Start position in chromosome" + uint chromEnd; "End position in chromosome" + float value; "Value for a given interval" + ) + "#; + let schema = match &mut self.bbi { + BBIReadRaw::BigWigFile(_) + | BBIReadRaw::BigWigRemote(_) + | BBIReadRaw::BigWigFileLike(_) => BEDGRAPH.to_string(), + BBIReadRaw::BigBedFile(b) => b + .autosql() + .map_err(|e| PyErr::new::(format!("{}", e)))?, + BBIReadRaw::BigBedRemote(b) => b + .autosql() + .map_err(|e| PyErr::new::(format!("{}", e)))?, + BBIReadRaw::BigBedFileLike(b) => b + .autosql() + .map_err(|e| PyErr::new::(format!("{}", e)))?, + }; + let obj = if parse { + let mut declarations = parse_autosql(&schema).map_err(|_| { + PyErr::new::("Unable to parse autosql.") + })?; + if declarations.len() > 1 { + return Err(PyErr::new::( + "Unexpected extra declarations.", + )); + } + let declaration = declarations.pop(); + match declaration { + None => PyDict::new(py).to_object(py), + Some(d) => { + let fields = d + .fields + .iter() + .map(|f| (&f.name, 
f.field_type.to_string(), &f.comment)) + .collect::>() + .to_object(py); + [ + ("name", d.name.name.to_object(py)), + ("comment", d.comment.to_object(py)), + ("fields", fields), + ] + .into_py_dict(py) + .to_object(py) + } + } + } else { + schema.to_object(py) + }; + Ok(obj) + } + + /// Returns the records of a given range on a chromosome. + /// + /// The result is an iterator of tuples. For bigWigs, these tuples are in + /// the format (start: int, end: int, value: float). For bigBeds, these + /// tuples are in the format (start: int, end: int, ...), where the "rest" + /// fields are split by whitespace. /// - /// This returns an `IntervalIterator`. - fn intervals( + /// Missing values in bigWigs will results in non-contiguous records. + /// + /// The chrom argument is the name of the chromosome. + /// The start and end arguments denote the range to get values for. + /// If end is not provided, it defaults to the length of the chromosome. + /// If start is not provided, it defaults to the beginning of the chromosome. 
+ fn records( &mut self, + py: Python<'_>, chrom: String, start: Option, end: Option, - ) -> PyResult { - let (start, end) = start_end(self, &chrom, start, end)?; - match &self.bigwig { - BigWigRaw::File(b) => { + ) -> PyResult { + let (start, end) = start_end(&self.bbi, &chrom, start, end)?; + match &self.bbi { + BBIReadRaw::BigWigFile(b) => { let b = b.reopen()?; - Ok(IntervalIterator { + Ok(BigWigIntervalIterator { iter: Box::new(b.get_interval_move(&chrom, start, end).unwrap()), - }) + } + .into_py(py)) } #[cfg(feature = "remote")] - BigWigRaw::Remote(b) => { + BBIReadRaw::BigWigRemote(b) => { let b = b.reopen()?; - Ok(IntervalIterator { + Ok(BigWigIntervalIterator { iter: Box::new(b.get_interval_move(&chrom, start, end).unwrap()), - }) + } + .into_py(py)) } - BigWigRaw::FileLike(b) => { + BBIReadRaw::BigWigFileLike(b) => { let b = b.reopen()?; - Ok(IntervalIterator { + Ok(BigWigIntervalIterator { iter: Box::new(b.get_interval_move(&chrom, start, end).unwrap()), - }) + } + .into_py(py)) + } + BBIReadRaw::BigBedFile(b) => { + let b = b.reopen()?; + Ok(BigBedEntriesIterator { + iter: Box::new(b.get_interval_move(&chrom, start, end).unwrap()), + } + .into_py(py)) + } + #[cfg(feature = "remote")] + BBIReadRaw::BigBedRemote(b) => { + let b = b.reopen()?; + Ok(BigBedEntriesIterator { + iter: Box::new(b.get_interval_move(&chrom, start, end).unwrap()), + } + .into_py(py)) + } + BBIReadRaw::BigBedFileLike(b) => { + let b = b.reopen()?; + Ok(BigBedEntriesIterator { + iter: Box::new(b.get_interval_move(&chrom, start, end).unwrap()), + } + .into_py(py)) } } } - /// Returns the values of a given range on a chromosome. The result is an iterator of floats, of length (end - start). - /// If a value does not exist in the bigwig for a specific base, it will be nan. + /// Returns the values of a given range on a chromosome. + /// + /// For bigWigs, the result is an array of length (end - start). + /// If a value does not exist in the bigwig for a specific base, it will be nan. 
+ /// + /// For bigBeds, the returned array instead represents a pileup of the count + /// of intervals overlapping each base. /// /// The chrom argument is the name of the chromosome. /// The start and end arguments denote the range to get values for. @@ -284,6 +382,65 @@ impl BigWigRead { } Ok(Array::from(v)) } + fn to_entry_array>>( + start: usize, + end: usize, + iter: I, + ) -> Result, BBIReadError> { + use numpy::ndarray::Array; + let mut v = vec![0.0; end - start]; + for interval in iter { + let interval = interval?; + let interval_start = (interval.start as usize) - start; + let interval_end = (interval.end as usize) - start; + for i in v[interval_start..interval_end].iter_mut() { + *i += 1.0; + } + } + Ok(Array::from(v)) + } + fn to_array_entry_bins>>( + start: usize, + end: usize, + iter: I, + bins: usize, + ) -> Result, BBIReadError> { + use numpy::ndarray::Array; + let mut v = vec![0.0; bins]; + let bin_size = (end - start) as f64 / bins as f64; + for interval in iter { + let interval = interval?; + let interval_start = (interval.start as usize) - start; + let interval_end = (interval.end as usize) - start; + let bin_start = ((interval_start as f64) / bin_size) as usize; + let bin_end = ((interval_end as f64) / bin_size).ceil() as usize; + for bin in bin_start..bin_end { + v[bin] += 1.0; + } + } + Ok(Array::from(v)) + } + fn to_entry_array_zoom>>( + start: usize, + end: usize, + iter: I, + bins: usize, + ) -> Result, BBIReadError> { + use numpy::ndarray::Array; + let mut v = vec![0.0; bins]; + let bin_size = (end - start) as f64 / bins as f64; + for interval in iter { + let interval = interval?; + let interval_start = (interval.start as usize).max(start) - start; + let interval_end = (interval.end as usize).min(end) - start; + let bin_start = ((interval_start as f64) / bin_size) as usize; + let bin_end = ((interval_end as f64) / bin_size).ceil() as usize; + for bin in bin_start..bin_end { + v[bin] += interval.summary.total_items as f64; + } + } + 
Ok(Array::from(v)) + } let summary = match summary.as_deref() { None => Summary::Mean, @@ -296,27 +453,35 @@ impl BigWigRead { ))); } }; - let (start, end) = start_end(self, &chrom, start, end)?; - macro_rules! to_array { - ($b:ident) => {{ - match bins { - Some(bins) if !exact.unwrap_or(false) => { - let max_zoom_size = ((end - start) as f32 / (bins * 2) as f32) as u32; - let zoom = ($b) - .info() - .zoom_headers - .iter() - .filter(|z| z.reduction_level <= max_zoom_size) - .min_by_key(|z| max_zoom_size - z.reduction_level); - match zoom { - Some(zoom) => { - let iter = ($b) - .get_zoom_interval(&chrom, start, end, zoom.reduction_level) - .map_err(|e| { - PyErr::new::(format!("{}", e)) - })?; - Python::with_gil(|py| { - Ok(to_array_zoom( + let (start, end) = start_end(&self.bbi, &chrom, start, end)?; + fn intervals_to_array( + b: &mut BigWigReadRaw, + chrom: &str, + start: u32, + end: u32, + bins: Option, + summary: Summary, + exact: Option, + ) -> PyResult { + match bins { + Some(bins) if !exact.unwrap_or(false) => { + let max_zoom_size = ((end - start) as f32 / (bins * 2) as f32) as u32; + let zoom = b + .info() + .zoom_headers + .iter() + .filter(|z| z.reduction_level <= max_zoom_size) + .min_by_key(|z| max_zoom_size - z.reduction_level); + match zoom { + Some(zoom) => { + let iter = b + .get_zoom_interval(&chrom, start, end, zoom.reduction_level) + .map_err(|e| { + PyErr::new::(format!("{}", e)) + })?; + Python::with_gil(|py| { + Ok( + to_array_zoom( start as usize, end as usize, iter, @@ -327,72 +492,166 @@ impl BigWigRead { PyErr::new::(format!("{}", e)) })? 
.into_pyarray(py) - .to_object(py)) - }) - } - None => { - let iter = ($b).get_interval(&chrom, start, end).map_err(|e| { - PyErr::new::(format!("{}", e)) - })?; - Python::with_gil(|py| { - Ok(to_array_bins( + .to_object(py), + ) + }) + } + None => { + let iter = b.get_interval(&chrom, start, end).map_err(|e| { + PyErr::new::(format!("{}", e)) + })?; + Python::with_gil(|py| { + Ok( + to_array_bins( start as usize, end as usize, iter, summary, bins, ) - .map_err(|e| { - PyErr::new::(format!("{}", e)) - })? - .into_pyarray(py) - .to_object(py)) - }) - } - } - } - Some(bins) => { - let iter = ($b).get_interval(&chrom, start, end).map_err(|e| { - PyErr::new::(format!("{}", e)) - })?; - Python::with_gil(|py| { - Ok( - to_array_bins(start as usize, end as usize, iter, summary, bins) .map_err(|e| { PyErr::new::(format!("{}", e)) })? .into_pyarray(py) .to_object(py), - ) - }) + ) + }) + } } - _ => { - let iter = ($b).get_interval(&chrom, start, end).map_err(|e| { - PyErr::new::(format!("{}", e)) - })?; - Python::with_gil(|py| { - Ok(to_array(start as usize, end as usize, iter) + } + Some(bins) => { + let iter = b + .get_interval(&chrom, start, end) + .map_err(|e| PyErr::new::(format!("{}", e)))?; + Python::with_gil(|py| { + Ok( + to_array_bins(start as usize, end as usize, iter, summary, bins) .map_err(|e| { PyErr::new::(format!("{}", e)) })? .into_pyarray(py) - .to_object(py)) - }) + .to_object(py), + ) + }) + } + _ => { + let iter = b + .get_interval(&chrom, start, end) + .map_err(|e| PyErr::new::(format!("{}", e)))?; + Python::with_gil(|py| { + Ok(to_array(start as usize, end as usize, iter) + .map_err(|e| { + PyErr::new::(format!("{}", e)) + })? 
+ .into_pyarray(py) + .to_object(py)) + }) + } + } + } + fn entries_to_array( + b: &mut BigBedReadRaw, + chrom: &str, + start: u32, + end: u32, + bins: Option, + exact: Option, + ) -> PyResult { + match bins { + Some(bins) if !exact.unwrap_or(false) => { + let max_zoom_size = ((end - start) as f32 / (bins * 2) as f32) as u32; + let zoom = b + .info() + .zoom_headers + .iter() + .filter(|z| z.reduction_level <= max_zoom_size) + .min_by_key(|z| max_zoom_size - z.reduction_level); + match zoom { + Some(zoom) => { + let iter = b + .get_zoom_interval(&chrom, start, end, zoom.reduction_level) + .map_err(|e| { + PyErr::new::(format!("{}", e)) + })?; + Python::with_gil(|py| { + Ok( + to_entry_array_zoom(start as usize, end as usize, iter, bins) + .map_err(|e| { + PyErr::new::(format!( + "{}", + e + )) + })? + .into_pyarray(py) + .to_object(py), + ) + }) + } + None => { + let iter = b.get_interval(&chrom, start, end).map_err(|e| { + PyErr::new::(format!("{}", e)) + })?; + Python::with_gil(|py| { + Ok( + to_array_entry_bins(start as usize, end as usize, iter, bins) + .map_err(|e| { + PyErr::new::(format!( + "{}", + e + )) + })? + .into_pyarray(py) + .to_object(py), + ) + }) + } } } - }}; + Some(bins) => { + let iter = b + .get_interval(&chrom, start, end) + .map_err(|e| PyErr::new::(format!("{}", e)))?; + Python::with_gil(|py| { + Ok( + to_array_entry_bins(start as usize, end as usize, iter, bins) + .map_err(|e| { + PyErr::new::(format!("{}", e)) + })? + .into_pyarray(py) + .to_object(py), + ) + }) + } + _ => { + let iter = b + .get_interval(&chrom, start, end) + .map_err(|e| PyErr::new::(format!("{}", e)))?; + Python::with_gil(|py| { + Ok(to_entry_array(start as usize, end as usize, iter) + .map_err(|e| { + PyErr::new::(format!("{}", e)) + })? 
+ .into_pyarray(py) + .to_object(py)) + }) + } + } } - match &mut self.bigwig { - BigWigRaw::File(b) => { - to_array!(b) + match &mut self.bbi { + BBIReadRaw::BigWigFile(b) => { + intervals_to_array(b, &chrom, start, end, bins, summary, exact) } #[cfg(feature = "remote")] - BigWigRaw::Remote(b) => { - to_array!(b) + BBIReadRaw::BigWigRemote(b) => { + intervals_to_array(b, &chrom, start, end, bins, summary, exact) } - BigWigRaw::FileLike(b) => { - to_array!(b) + BBIReadRaw::BigWigFileLike(b) => { + intervals_to_array(b, &chrom, start, end, bins, summary, exact) } + BBIReadRaw::BigBedFile(b) => entries_to_array(b, &chrom, start, end, bins, exact), + #[cfg(feature = "remote")] + BBIReadRaw::BigBedRemote(b) => entries_to_array(b, &chrom, start, end, bins, exact), + BBIReadRaw::BigBedFileLike(b) => entries_to_array(b, &chrom, start, end, bins, exact), } } @@ -402,63 +661,37 @@ impl BigWigRead { /// If it is None, then all chroms will be returned. /// If it is a String, then the length of that chromosome will be returned. /// If the chromosome doesn't exist, nothing will be returned. 
- fn chroms(&mut self, py: Python, chrom: Option) -> PyResult> { - match &self.bigwig { - BigWigRaw::File(b) => { - let val = match chrom { - Some(chrom) => b - .chroms() + fn chroms(&mut self, py: Python, chrom: Option) -> Option { + fn get_chrom_obj( + b: &B, + py: Python, + chrom: Option, + ) -> Option { + match chrom { + Some(chrom) => b + .chroms() + .into_iter() + .find(|c| c.name == chrom) + .map(|c| c.length) + .map(|c| c.to_object(py)), + None => Some( + b.chroms() .into_iter() - .find(|c| c.name == chrom) - .map(|c| c.length) - .map(|c| c.to_object(py)), - None => Some( - b.chroms() - .into_iter() - .map(|c| (c.name.clone(), c.length)) - .into_py_dict(py) - .into(), - ), - }; - Ok(val) + .map(|c| (c.name.clone(), c.length)) + .into_py_dict(py) + .into(), + ), } + } + match &self.bbi { + BBIReadRaw::BigWigFile(b) => get_chrom_obj(b, py, chrom), #[cfg(feature = "remote")] - BigWigRaw::Remote(b) => { - let val = match chrom { - Some(chrom) => b - .chroms() - .into_iter() - .find(|c| c.name == chrom) - .map(|c| c.length) - .map(|c| c.to_object(py)), - None => Some( - b.chroms() - .into_iter() - .map(|c| (c.name.clone(), c.length)) - .into_py_dict(py) - .into(), - ), - }; - Ok(val) - } - BigWigRaw::FileLike(b) => { - let val = match chrom { - Some(chrom) => b - .chroms() - .into_iter() - .find(|c| c.name == chrom) - .map(|c| c.length) - .map(|c| c.to_object(py)), - None => Some( - b.chroms() - .into_iter() - .map(|c| (c.name.clone(), c.length)) - .into_py_dict(py) - .into(), - ), - }; - Ok(val) - } + BBIReadRaw::BigWigRemote(b) => get_chrom_obj(b, py, chrom), + BBIReadRaw::BigWigFileLike(b) => get_chrom_obj(b, py, chrom), + BBIReadRaw::BigBedFile(b) => get_chrom_obj(b, py, chrom), + #[cfg(feature = "remote")] + BBIReadRaw::BigBedRemote(b) => get_chrom_obj(b, py, chrom), + BBIReadRaw::BigBedFileLike(b) => get_chrom_obj(b, py, chrom), } } } @@ -467,13 +700,13 @@ impl BigWigRead { /// It returns only values that exist in the bigWig, skipping /// any missing 
intervals. #[pyclass(module = "pybigtools")] -struct IntervalIterator { +struct BigWigIntervalIterator { iter: Box> + Send>, } #[pymethods] -impl IntervalIterator { - fn __iter__(slf: PyRefMut) -> PyResult> { +impl BigWigIntervalIterator { + fn __iter__(slf: PyRefMut) -> PyResult> { Ok(slf.into()) } @@ -486,6 +719,27 @@ impl IntervalIterator { } } +/// This class is an interator for the entries in a bigBed file. +#[pyclass(module = "pybigtools")] +struct BigBedEntriesIterator { + iter: Box> + Send>, +} + +#[pymethods] +impl BigBedEntriesIterator { + fn __iter__(slf: PyRefMut) -> PyResult> { + Ok(slf.into()) + } + + fn __next__(mut slf: PyRefMut) -> PyResult> { + Ok(slf + .iter + .next() + .map(|e| e.unwrap()) + .map(|e| (e.start, e.end, e.rest))) + } +} + /// This class is the interface for writing a bigWig. #[pyclass(module = "pybigtools")] struct BigWigWrite { @@ -625,144 +879,6 @@ impl BigWigWrite { } } -enum BigBedRaw { - File(BigBedReadRaw>), - #[cfg(feature = "remote")] - Remote(BigBedReadRaw>), - FileLike(BigBedReadRaw>), -} - -/// This class is the interface for reading a bigBed. -#[pyclass(module = "pybigtools")] -struct BigBedRead { - bigbed: BigBedRaw, -} - -#[pymethods] -impl BigBedRead { - /// Returns the entries of a given range on a chromosome. The result is an iterator of (int, int, String) in the format (start, end, rest). - /// The entries may not be contiguous if the values in the bigbed are not. - /// The chrom argument is the name of the chromosome. - /// The start and end arguments denote the range to get values for. - /// If end is not provided, it defaults to the length of the chromosome. - /// If start is not provided, it defaults to the beginning of the chromosome. - /// - /// This returns an `EntriesIterator`. 
- fn entries( - &mut self, - chrom: String, - start: Option, - end: Option, - ) -> PyResult { - let (start, end) = start_end(self, &chrom, start, end)?; - match &mut self.bigbed { - BigBedRaw::File(b) => { - let b = b.reopen()?; - Ok(EntriesIterator { - iter: Box::new(b.get_interval_move(&chrom, start, end).unwrap()), - }) - } - #[cfg(feature = "remote")] - BigBedRaw::Remote(b) => { - let b = b.reopen()?; - Ok(EntriesIterator { - iter: Box::new(b.get_interval_move(&chrom, start, end).unwrap()), - }) - } - BigBedRaw::FileLike(b) => { - let b = b.reopen()?; - Ok(EntriesIterator { - iter: Box::new(b.get_interval_move(&chrom, start, end).unwrap()), - }) - } - } - } - - /// Returns the chromosomes in a bigwig, and their lengths. - /// The chroms argument can be either String or None. If it is None, then all chroms will be returned. If it is a String, then the length of that chromosome will be returned. - /// If the chromosome doesn't exist, nothing will be returned. - fn chroms(&mut self, py: Python, chrom: Option) -> PyResult> { - match &self.bigbed { - BigBedRaw::File(b) => { - let val = match chrom { - Some(chrom) => b - .chroms() - .into_iter() - .find(|c| c.name == chrom) - .map(|c| c.length) - .map(|c| c.to_object(py)), - None => Some( - b.chroms() - .into_iter() - .map(|c| (c.name.clone(), c.length)) - .into_py_dict(py) - .into(), - ), - }; - Ok(val) - } - #[cfg(feature = "remote")] - BigBedRaw::Remote(b) => { - let val = match chrom { - Some(chrom) => b - .chroms() - .into_iter() - .find(|c| c.name == chrom) - .map(|c| c.length) - .map(|c| c.to_object(py)), - None => Some( - b.chroms() - .into_iter() - .map(|c| (c.name.clone(), c.length)) - .into_py_dict(py) - .into(), - ), - }; - Ok(val) - } - BigBedRaw::FileLike(b) => { - let val = match chrom { - Some(chrom) => b - .chroms() - .into_iter() - .find(|c| c.name == chrom) - .map(|c| c.length) - .map(|c| c.to_object(py)), - None => Some( - b.chroms() - .into_iter() - .map(|c| (c.name.clone(), c.length)) - 
.into_py_dict(py) - .into(), - ), - }; - Ok(val) - } - } - } -} - -/// This class is an interator for the entries in a bigBed file. -#[pyclass(module = "pybigtools")] -struct EntriesIterator { - iter: Box> + Send>, -} - -#[pymethods] -impl EntriesIterator { - fn __iter__(slf: PyRefMut) -> PyResult> { - Ok(slf.into()) - } - - fn __next__(mut slf: PyRefMut) -> PyResult> { - Ok(slf - .iter - .next() - .map(|e| e.unwrap()) - .map(|e| (e.start, e.end, e.rest))) - } -} - /// This class is the interface for writing to a bigBed. #[pyclass(module = "pybigtools")] struct BigBedWrite { @@ -966,9 +1082,8 @@ impl BigWigAverageOverBedEntriesIterator { /// /// This returns one of the following: /// - `BigWigWrite` -/// - `BigWigRead` /// - `BigBedWrite` -/// - `BigBedRead` +/// - `BBIRead` #[pyfunction] fn open(py: Python, path_url_or_file_like: PyObject, mode: Option) -> PyResult { let iswrite = match &mode { @@ -1000,13 +1115,13 @@ fn open(py: Python, path_url_or_file_like: PyObject, mode: Option) -> Py ))), }; let read = match BigWigReadRaw::open(file_like.clone()) { - Ok(bwr) => BigWigRead { - bigwig: BigWigRaw::FileLike(bwr.cached()), + Ok(bwr) => BBIRead { + bbi: BBIReadRaw::BigWigFileLike(bwr.cached()), } .into_py(py), Err(_) => match BigBedReadRaw::open(file_like) { - Ok(bbr) => BigBedRead { - bigbed: BigBedRaw::FileLike(bbr.cached()), + Ok(bbr) => BBIRead { + bbi: BBIReadRaw::BigBedFileLike(bbr.cached()), } .into_py(py), Err(e) => { @@ -1071,8 +1186,8 @@ fn open_path_or_url( "bw" | "bigWig" | "bigwig" => { if isfile { match BigWigReadRaw::open_file(&path_url_or_file_like) { - Ok(bwr) => BigWigRead { - bigwig: BigWigRaw::File(bwr.cached()), + Ok(bwr) => BBIRead { + bbi: BBIReadRaw::BigWigFile(bwr.cached()), } .into_py(py), Err(_) => { @@ -1084,8 +1199,8 @@ fn open_path_or_url( } else { #[cfg(feature = "remote")] match BigWigReadRaw::open(RemoteFile::new(&path_url_or_file_like)) { - Ok(bwr) => BigWigRead { - bigwig: BigWigRaw::Remote(bwr.cached()), + Ok(bwr) => BBIRead { + 
bbi: BBIReadRaw::BigWigRemote(bwr.cached()), } .into_py(py), Err(_) => { @@ -1106,8 +1221,8 @@ fn open_path_or_url( "bb" | "bigBed" | "bigbed" => { if isfile { match BigBedReadRaw::open_file(&path_url_or_file_like) { - Ok(bwr) => BigBedRead { - bigbed: BigBedRaw::File(bwr.cached()), + Ok(bwr) => BBIRead { + bbi: BBIReadRaw::BigBedFile(bwr.cached()), } .into_py(py), Err(_) => { @@ -1119,8 +1234,8 @@ fn open_path_or_url( } else { #[cfg(feature = "remote")] match BigBedReadRaw::open(RemoteFile::new(&path_url_or_file_like)) { - Ok(bwr) => BigBedRead { - bigbed: BigBedRaw::Remote(bwr.cached()), + Ok(bwr) => BBIRead { + bbi: BBIReadRaw::BigBedRemote(bwr.cached()), } .into_py(py), Err(_) => { @@ -1275,12 +1390,10 @@ fn pybigtools(_py: Python, m: &PyModule) -> PyResult<()> { m.add_class::()?; m.add_class::()?; - m.add_class::()?; - m.add_class::()?; - - m.add_class::()?; + m.add_class::()?; - m.add_class::()?; + m.add_class::()?; + m.add_class::()?; Ok(()) } From 07fe3b6587e39bcaf0e75963b9ea9277a1db5140 Mon Sep 17 00:00:00 2001 From: Jack Huey <31162821+jackh726@users.noreply.github.com> Date: Fri, 15 Mar 2024 20:57:35 -0400 Subject: [PATCH 03/30] Overhaul pybigtools API --- pybigtools/src/lib.rs | 463 +++++++++++++++++++++++++++++++++++------- 1 file changed, 391 insertions(+), 72 deletions(-) diff --git a/pybigtools/src/lib.rs b/pybigtools/src/lib.rs index 08f0d8b..a5731d1 100644 --- a/pybigtools/src/lib.rs +++ b/pybigtools/src/lib.rs @@ -1,5 +1,6 @@ #![allow(non_snake_case)] +use std::collections::VecDeque; use std::fs::File; use std::io::{self, BufReader}; use std::path::Path; @@ -21,6 +22,7 @@ use bigtools::{ use bigtools::utils::reopen::Reopen; use file_like::PyFileLikeObject; +use numpy::ndarray::Array1; use numpy::IntoPyArray; use pyo3::exceptions::{self, PyTypeError}; use pyo3::prelude::*; @@ -81,6 +83,229 @@ enum BBIReadRaw { BigBedFileLike(BigBedReadRaw>), } +#[derive(Copy, Clone, Debug)] +enum Summary { + Mean, + Min, + Max, +} + +fn to_array_entry_bins>>( + 
start: usize, + end: usize, + iter: I, + summary: Summary, + bins: usize, +) -> Result, BBIReadError> { + use numpy::ndarray::Array; + let mut bin_data: VecDeque<(usize, usize, usize, Vec)> = VecDeque::new(); + let mut v = vec![0.0; bins]; + let bin_size = (end - start) as f64 / bins as f64; + for interval in iter { + let interval = interval?; + let interval_start = (interval.start as usize).max(start) - start; + let interval_end = (interval.end as usize).min(end) - start; + let bin_start = ((interval_start as f64) / bin_size) as usize; + let bin_end = ((interval_end as f64) / bin_size).ceil() as usize; + + while let Some(front) = bin_data.front_mut() { + if front.0 < bin_start { + let front = bin_data.pop_front().unwrap(); + let bin = front.0; + + match summary { + Summary::Min => { + v[bin] = front + .3 + .into_iter() + .reduce(|min, v| min.min(v)) + .unwrap_or(0.0); + } + Summary::Max => { + v[bin] = front + .3 + .into_iter() + .reduce(|max, v| max.max(v)) + .unwrap_or(0.0); + } + Summary::Mean => { + v[bin] = front.3.into_iter().sum::() / bin_size; + } + } + } else { + break; + } + } + while let Some(bin) = bin_data + .back_mut() + .map(|b| ((b.0 + 1) < bin_end).then(|| b.0 + 1)) + .unwrap_or(Some(bin_start)) + { + let bin_start = ((bin as f64) * bin_size).ceil() as usize; + let bin_end = (((bin + 1) as f64) * bin_size).ceil() as usize; + + bin_data.push_back((bin, bin_start, bin_end, vec![0.0; bin_end - bin_start])); + } + for bin in bin_data.iter() { + assert!( + (bin_start..bin_end).contains(&bin.0), + "{} not in {}..{}", + bin.0, + bin_start, + bin_end + ); + } + for bin in bin_start..bin_end { + assert!(bin_data.iter().find(|b| b.0 == bin).is_some()); + } + for (_bin, bin_start, bin_end, data) in bin_data.iter_mut() { + let overlap_start = (*bin_start).max(interval_start); + let overlap_end = (*bin_end).min(interval_end); + + for i in &mut data[(overlap_start - *bin_start)..(overlap_end - *bin_start)] { + *i = *i + 1.0; + } + } + } + while let Some(front) 
= bin_data.pop_front() { + let bin = front.0; + + match summary { + Summary::Min => { + v[bin] = front + .3 + .into_iter() + .reduce(|min, v| min.min(v)) + .unwrap_or(0.0); + } + Summary::Max => { + v[bin] = front + .3 + .into_iter() + .reduce(|max, v| max.max(v)) + .unwrap_or(0.0); + } + Summary::Mean => { + v[bin] = front.3.into_iter().sum::() / bin_size; + } + } + } + Ok(Array::from(v)) +} + +fn to_entry_array_zoom>>( + start: usize, + end: usize, + iter: I, + summary: Summary, + bins: usize, +) -> Result, BBIReadError> { + use numpy::ndarray::Array; + let mut bin_data: VecDeque<(usize, usize, usize, Vec)> = VecDeque::new(); + let mut v = vec![0.0; bins]; + let bin_size = (end - start) as f64 / bins as f64; + for interval in iter { + let interval = interval?; + let interval_start = (interval.start as usize).max(start) - start; + let interval_end = (interval.end as usize).min(end) - start; + let bin_start = ((interval_start as f64) / bin_size) as usize; + let bin_end = ((interval_end as f64) / bin_size).ceil() as usize; + + while let Some(front) = bin_data.front_mut() { + if front.0 < bin_start { + let front = bin_data.pop_front().unwrap(); + let bin = front.0; + + match summary { + Summary::Min => { + v[bin] = front + .3 + .into_iter() + .reduce(|min, v| min.min(v)) + .unwrap_or(0.0) + .max(0.0); + } + Summary::Max => { + v[bin] = front + .3 + .into_iter() + .reduce(|max, v| max.max(v)) + .unwrap_or(0.0) + .max(0.0); + } + Summary::Mean => { + v[bin] = front.3.into_iter().sum::() / bin_size; + } + } + } else { + break; + } + } + while let Some(bin) = bin_data + .back_mut() + .map(|b| ((b.0 + 1) < bin_end).then(|| b.0 + 1)) + .unwrap_or(Some(bin_start)) + { + let bin_start = ((bin as f64) * bin_size).ceil() as usize; + let bin_end = (((bin + 1) as f64) * bin_size).ceil() as usize; + + bin_data.push_back((bin, bin_start, bin_end, vec![f64::NAN; bin_end - bin_start])); + } + for bin in bin_data.iter() { + assert!( + (bin_start..bin_end).contains(&bin.0), + "{} not 
in {}..{}", + bin.0, + bin_start, + bin_end + ); + } + for bin in bin_start..bin_end { + assert!(bin_data.iter().find(|b| b.0 == bin).is_some()); + } + for (_bin, bin_start, bin_end, data) in bin_data.iter_mut() { + let overlap_start = (*bin_start).max(interval_start); + let overlap_end = (*bin_end).min(interval_end); + + let mean = interval.summary.sum / (interval.end - interval.start) as f64; + for i in &mut data[(overlap_start - *bin_start)..(overlap_end - *bin_start)] { + match summary { + Summary::Mean => *i += mean, + Summary::Min => *i = i.min(interval.summary.min_val), + Summary::Max => *i = i.max(interval.summary.max_val), + } + *i = i.max(0.0); + } + } + } + while let Some(front) = bin_data.pop_front() { + let bin = front.0; + + match summary { + Summary::Min => { + v[bin] = front + .3 + .into_iter() + .reduce(|min, v| min.min(v)) + .unwrap_or(0.0) + .max(0.0); + } + Summary::Max => { + v[bin] = front + .3 + .into_iter() + .reduce(|max, v| max.max(v)) + .unwrap_or(0.0) + .max(0.0); + } + Summary::Mean => { + v[bin] = front.3.into_iter().sum::() / bin_size; + } + } + } + Ok(Array::from(v)) +} + /// This class is the interface for reading a bigWig or bigBed. 
#[pyclass(module = "pybigtools")] struct BBIRead { @@ -255,12 +480,6 @@ impl BBIRead { summary: Option, exact: Option, ) -> PyResult { - enum Summary { - Mean, - Min, - Max, - } - use numpy::ndarray::Array1; fn to_array>>( start: usize, end: usize, @@ -399,48 +618,6 @@ impl BBIRead { } Ok(Array::from(v)) } - fn to_array_entry_bins>>( - start: usize, - end: usize, - iter: I, - bins: usize, - ) -> Result, BBIReadError> { - use numpy::ndarray::Array; - let mut v = vec![0.0; bins]; - let bin_size = (end - start) as f64 / bins as f64; - for interval in iter { - let interval = interval?; - let interval_start = (interval.start as usize) - start; - let interval_end = (interval.end as usize) - start; - let bin_start = ((interval_start as f64) / bin_size) as usize; - let bin_end = ((interval_end as f64) / bin_size).ceil() as usize; - for bin in bin_start..bin_end { - v[bin] += 1.0; - } - } - Ok(Array::from(v)) - } - fn to_entry_array_zoom>>( - start: usize, - end: usize, - iter: I, - bins: usize, - ) -> Result, BBIReadError> { - use numpy::ndarray::Array; - let mut v = vec![0.0; bins]; - let bin_size = (end - start) as f64 / bins as f64; - for interval in iter { - let interval = interval?; - let interval_start = (interval.start as usize).max(start) - start; - let interval_end = (interval.end as usize).min(end) - start; - let bin_start = ((interval_start as f64) / bin_size) as usize; - let bin_end = ((interval_end as f64) / bin_size).ceil() as usize; - for bin in bin_start..bin_end { - v[bin] += interval.summary.total_items as f64; - } - } - Ok(Array::from(v)) - } let summary = match summary.as_deref() { None => Summary::Mean, @@ -555,6 +732,7 @@ impl BBIRead { start: u32, end: u32, bins: Option, + summary: Summary, exact: Option, ) -> PyResult { match bins { @@ -574,17 +752,18 @@ impl BBIRead { PyErr::new::(format!("{}", e)) })?; Python::with_gil(|py| { - Ok( - to_entry_array_zoom(start as usize, end as usize, iter, bins) - .map_err(|e| { - PyErr::new::(format!( - "{}", - e 
- )) - })? - .into_pyarray(py) - .to_object(py), + Ok(to_entry_array_zoom( + start as usize, + end as usize, + iter, + summary, + bins, ) + .map_err(|e| { + PyErr::new::(format!("{}", e)) + })? + .into_pyarray(py) + .to_object(py)) }) } None => { @@ -592,17 +771,18 @@ impl BBIRead { PyErr::new::(format!("{}", e)) })?; Python::with_gil(|py| { - Ok( - to_array_entry_bins(start as usize, end as usize, iter, bins) - .map_err(|e| { - PyErr::new::(format!( - "{}", - e - )) - })? - .into_pyarray(py) - .to_object(py), + Ok(to_array_entry_bins( + start as usize, + end as usize, + iter, + summary, + bins, ) + .map_err(|e| { + PyErr::new::(format!("{}", e)) + })? + .into_pyarray(py) + .to_object(py)) }) } } @@ -613,7 +793,7 @@ impl BBIRead { .map_err(|e| PyErr::new::(format!("{}", e)))?; Python::with_gil(|py| { Ok( - to_array_entry_bins(start as usize, end as usize, iter, bins) + to_array_entry_bins(start as usize, end as usize, iter, summary, bins) .map_err(|e| { PyErr::new::(format!("{}", e)) })? 
@@ -648,10 +828,16 @@ impl BBIRead { BBIReadRaw::BigWigFileLike(b) => { intervals_to_array(b, &chrom, start, end, bins, summary, exact) } - BBIReadRaw::BigBedFile(b) => entries_to_array(b, &chrom, start, end, bins, exact), + BBIReadRaw::BigBedFile(b) => { + entries_to_array(b, &chrom, start, end, bins, summary, exact) + } #[cfg(feature = "remote")] - BBIReadRaw::BigBedRemote(b) => entries_to_array(b, &chrom, start, end, bins, exact), - BBIReadRaw::BigBedFileLike(b) => entries_to_array(b, &chrom, start, end, bins, exact), + BBIReadRaw::BigBedRemote(b) => { + entries_to_array(b, &chrom, start, end, bins, summary, exact) + } + BBIReadRaw::BigBedFileLike(b) => { + entries_to_array(b, &chrom, start, end, bins, summary, exact) + } } } @@ -1397,3 +1583,136 @@ fn pybigtools(_py: Python, m: &PyModule) -> PyResult<()> { Ok(()) } + +#[cfg(test)] +mod test { + use bigtools::BedEntry; + + use crate::to_array_entry_bins; + + #[test] + fn test_to_array_entry_bins() { + let entries = [BedEntry { + start: 10, + end: 20, + rest: "".to_string(), + }]; + let res = to_array_entry_bins( + 10, + 20, + entries.into_iter().map(|v| Ok(v)), + crate::Summary::Mean, + 1, + ) + .unwrap(); + let res = res.to_vec(); + assert_eq!(res, [1.0].into_iter().collect::>()); + + let entries = [BedEntry { + start: 10, + end: 20, + rest: "".to_string(), + }]; + let res = to_array_entry_bins( + 10, + 20, + entries.into_iter().map(|v| Ok(v)), + crate::Summary::Mean, + 2, + ) + .unwrap(); + let res = res.to_vec(); + assert_eq!(res, [1.0, 1.0].into_iter().collect::>()); + + let entries = [ + BedEntry { + start: 10, + end: 20, + rest: "".to_string(), + }, + BedEntry { + start: 15, + end: 20, + rest: "".to_string(), + }, + ]; + let res = to_array_entry_bins( + 10, + 20, + entries.clone().into_iter().map(|v| Ok(v)), + crate::Summary::Mean, + 2, + ) + .unwrap(); + let res = res.to_vec(); + assert_eq!(res, [1.0, 2.0].into_iter().collect::>()); + let res = to_array_entry_bins( + 10, + 20, + 
entries.clone().into_iter().map(|v| Ok(v)), + crate::Summary::Min, + 1, + ) + .unwrap(); + let res = res.to_vec(); + assert_eq!(res, [1.0].into_iter().collect::>()); + let res = to_array_entry_bins( + 10, + 20, + entries.clone().into_iter().map(|v| Ok(v)), + crate::Summary::Max, + 1, + ) + .unwrap(); + let res = res.to_vec(); + assert_eq!(res, [2.0].into_iter().collect::>()); + + let entries = [ + BedEntry { + start: 10, + end: 20, + rest: "".to_string(), + }, + BedEntry { + start: 15, + end: 20, + rest: "".to_string(), + }, + BedEntry { + start: 15, + end: 25, + rest: "".to_string(), + }, + ]; + let res = to_array_entry_bins( + 10, + 20, + entries.clone().into_iter().map(|v| Ok(v)), + crate::Summary::Mean, + 2, + ) + .unwrap(); + let res = res.to_vec(); + assert_eq!(res, [1.0, 3.0].into_iter().collect::>()); + let res = to_array_entry_bins( + 10, + 20, + entries.clone().into_iter().map(|v| Ok(v)), + crate::Summary::Min, + 1, + ) + .unwrap(); + let res = res.to_vec(); + assert_eq!(res, [1.0].into_iter().collect::>()); + let res = to_array_entry_bins( + 10, + 20, + entries.clone().into_iter().map(|v| Ok(v)), + crate::Summary::Max, + 1, + ) + .unwrap(); + let res = res.to_vec(); + assert_eq!(res, [3.0].into_iter().collect::>()); + } +} From 0da362ab606bac56dc39652b0e9385b4364496d4 Mon Sep 17 00:00:00 2001 From: Jack Huey <31162821+jackh726@users.noreply.github.com> Date: Mon, 18 Mar 2024 10:17:08 -0400 Subject: [PATCH 04/30] Split rest into fields of tuple --- pybigtools/src/lib.rs | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/pybigtools/src/lib.rs b/pybigtools/src/lib.rs index a5731d1..5b250a3 100644 --- a/pybigtools/src/lib.rs +++ b/pybigtools/src/lib.rs @@ -35,7 +35,6 @@ use url::Url; mod file_like; type ValueTuple = (u32, u32, f32); -type BedEntryTuple = (u32, u32, String); fn start_end( bbi: &BBIReadRaw, @@ -917,12 +916,19 @@ impl BigBedEntriesIterator { Ok(slf.into()) } - fn __next__(mut slf: PyRefMut) -> PyResult> { - 
Ok(slf - .iter - .next() - .map(|e| e.unwrap()) - .map(|e| (e.start, e.end, e.rest))) + fn __next__(mut slf: PyRefMut) -> PyResult> { + let py = slf.py(); + let next = match slf.iter.next() { + Some(n) => n.map_err(|e| PyErr::new::(format!("{}", e)))?, + None => return Ok(None), + }; + let elements: Vec<_> = [next.start.to_object(py), next.end.to_object(py)] + .into_iter() + .chain(next.rest.split_whitespace().map(|o| o.to_object(py))) + .collect(); + Ok(Some( + PyTuple::new::(py, elements.into_iter()).to_object(py), + )) } } From d205d4d597f2cccc1db5ab142144cf169b40ced3 Mon Sep 17 00:00:00 2001 From: Jack Huey <31162821+jackh726@users.noreply.github.com> Date: Mon, 18 Mar 2024 10:26:02 -0400 Subject: [PATCH 05/30] Add is_bigwig and is_bigbed --- pybigtools/src/lib.rs | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/pybigtools/src/lib.rs b/pybigtools/src/lib.rs index 5b250a3..ea43644 100644 --- a/pybigtools/src/lib.rs +++ b/pybigtools/src/lib.rs @@ -313,6 +313,44 @@ struct BBIRead { #[pymethods] impl BBIRead { + fn is_bigwig(&self) -> bool { + #[cfg(feature = "remote")] + { + matches!( + self.bbi, + BBIReadRaw::BigWigFile(_) + | BBIReadRaw::BigWigRemote(_) + | BBIReadRaw::BigWigFileLike(_) + ) + } + #[cfg(not(feature = "remote"))] + { + matches!( + self.bbi, + BBIReadRaw::BigWigFile(_) | BBIReadRaw::BigWigFileLike(_) + ) + } + } + + fn is_bigbed(&self) -> bool { + #[cfg(feature = "remote")] + { + matches!( + self.bbi, + BBIReadRaw::BigBedFile(_) + | BBIReadRaw::BigBedRemote(_) + | BBIReadRaw::BigBedFileLike(_) + ) + } + #[cfg(not(feature = "remote"))] + { + matches!( + self.bbi, + BBIReadRaw::BigBedFile(_) | BBIReadRaw::BigBedFileLike(_) + ) + } + } + /// Returns the autosql of this bbi file. /// /// For bigBeds, this comes directly from the autosql stored in the file. 
From d37ebe48013e66482d54d3e9d1379ec888e0409e Mon Sep 17 00:00:00 2001 From: Jack Huey <31162821+jackh726@users.noreply.github.com> Date: Tue, 19 Mar 2024 20:01:34 -0400 Subject: [PATCH 06/30] Move other values creation functions to outer scope and add simplified error conversion --- pybigtools/src/lib.rs | 686 ++++++++++++++++++++---------------------- 1 file changed, 325 insertions(+), 361 deletions(-) diff --git a/pybigtools/src/lib.rs b/pybigtools/src/lib.rs index ea43644..73a9924 100644 --- a/pybigtools/src/lib.rs +++ b/pybigtools/src/lib.rs @@ -89,6 +89,298 @@ enum Summary { Max, } +trait ToPyErr { + fn to_py_err(self) -> PyErr; +} + +impl ToPyErr for bigtools::ZoomIntervalError { + fn to_py_err(self) -> PyErr { + PyErr::new::(format!("{}", self)) + } +} +impl ToPyErr for bigtools::BBIReadError { + fn to_py_err(self) -> PyErr { + PyErr::new::(format!("{}", self)) + } +} + +trait ConvertResult { + fn convert_err(self) -> Result; +} +impl ConvertResult for Result { + fn convert_err(self) -> Result { + self.map_err(|e| e.to_py_err()) + } +} + +fn intervals_to_array( + b: &mut BigWigReadRaw, + chrom: &str, + start: u32, + end: u32, + bins: Option, + summary: Summary, + exact: bool, + missing: f64, + oob: f64, + arr: Option, +) -> PyResult { + let zoom = if let (Some(bins), false) = (bins, exact) { + let max_zoom_size = ((end - start) as f32 / (bins * 2) as f32) as u32; + let zoom = b + .info() + .zoom_headers + .iter() + .filter(|z| z.reduction_level <= max_zoom_size) + .min_by_key(|z| max_zoom_size - z.reduction_level); + zoom + } else { + None + }; + match (bins, zoom) { + (Some(bins), Some(zoom)) => { + let iter = b + .get_zoom_interval(&chrom, start, end, zoom.reduction_level) + .convert_err()?; + Python::with_gil(|py| { + Ok( + to_array_zoom(start as usize, end as usize, iter, summary, bins) + .convert_err()? 
+ .into_pyarray(py) + .to_object(py), + ) + }) + } + (Some(bins), None) => { + let iter = b.get_interval(&chrom, start, end).convert_err()?; + Python::with_gil(|py| { + Ok( + to_array_bins(start as usize, end as usize, iter, summary, bins) + .convert_err()? + .into_pyarray(py) + .to_object(py), + ) + }) + } + _ => { + let iter = b.get_interval(&chrom, start, end).convert_err()?; + Python::with_gil(|py| { + Ok(to_array(start as usize, end as usize, iter) + .convert_err()? + .into_pyarray(py) + .to_object(py)) + }) + } + } +} +fn entries_to_array( + b: &mut BigBedReadRaw, + chrom: &str, + start: u32, + end: u32, + bins: Option, + summary: Summary, + exact: bool, + missing: f64, + oob: f64, + arr: Option, +) -> PyResult { + match bins { + Some(bins) if !exact => { + let max_zoom_size = ((end - start) as f32 / (bins * 2) as f32) as u32; + let zoom = b + .info() + .zoom_headers + .iter() + .filter(|z| z.reduction_level <= max_zoom_size) + .min_by_key(|z| max_zoom_size - z.reduction_level); + match zoom { + Some(zoom) => { + let iter = b + .get_zoom_interval(&chrom, start, end, zoom.reduction_level) + .convert_err()?; + Python::with_gil(|py| { + Ok( + to_entry_array_zoom(start as usize, end as usize, iter, summary, bins) + .convert_err()? + .into_pyarray(py) + .to_object(py), + ) + }) + } + None => { + let iter = b.get_interval(&chrom, start, end).convert_err()?; + Python::with_gil(|py| { + Ok( + to_array_entry_bins(start as usize, end as usize, iter, summary, bins) + .convert_err()? + .into_pyarray(py) + .to_object(py), + ) + }) + } + } + } + Some(bins) => { + let iter = b.get_interval(&chrom, start, end).convert_err()?; + Python::with_gil(|py| { + Ok( + to_array_entry_bins(start as usize, end as usize, iter, summary, bins) + .convert_err()? + .into_pyarray(py) + .to_object(py), + ) + }) + } + _ => { + let iter = b.get_interval(&chrom, start, end).convert_err()?; + Python::with_gil(|py| { + Ok(to_entry_array(start as usize, end as usize, iter) + .convert_err()? 
+ .into_pyarray(py) + .to_object(py)) + }) + } + } +} +fn to_array>>( + start: usize, + end: usize, + iter: I, +) -> Result, BBIReadError> { + use numpy::ndarray::Array; + let mut v = vec![f64::NAN; end - start]; + for interval in iter { + let interval = interval?; + let interval_start = (interval.start as usize) - start; + let interval_end = (interval.end as usize) - start; + for i in v[interval_start..interval_end].iter_mut() { + *i = interval.value as f64; + } + } + Ok(Array::from(v)) +} +fn to_array_bins>>( + start: usize, + end: usize, + iter: I, + summary: Summary, + bins: usize, +) -> Result, BBIReadError> { + use numpy::ndarray::Array; + let mut v = match summary { + Summary::Min | Summary::Max => vec![f64::NAN; bins], + Summary::Mean => vec![0.0; bins], + }; + let bin_size = (end - start) as f64 / bins as f64; + for interval in iter { + let interval = interval?; + let interval_start = (interval.start as usize) - start; + let interval_end = (interval.end as usize) - start; + let bin_start = ((interval_start as f64) / bin_size) as usize; + let bin_end = ((interval_end as f64) / bin_size).ceil() as usize; + for bin in bin_start..bin_end { + let bin_start = (bin as f64) * bin_size; + let bin_end = (bin as f64 + 1.0) * bin_size; + let interval_start = (interval_start as f64).max(bin_start); + let interval_end = (interval_end as f64).min(bin_end); + // Possible overlaps + // bin |-----| |------| |-----| + // value |----| |----| |----| + // overlap ---- ------ ---- + match summary { + Summary::Min => { + v[bin] = v[bin].min(interval.value as f64); + } + Summary::Max => { + v[bin] = v[bin].max(interval.value as f64); + } + Summary::Mean => { + let overlap_size = interval_end - interval_start; + v[bin] += (overlap_size as f64) * interval.value as f64; + } + } + } + } + if let Summary::Mean = summary { + let last = v.last().copied().expect("Should be at least one bin."); + let last_size = (end - start) as f64 - (bins as f64 - 1.0) * bin_size; + v = 
v.into_iter().map(|v| v / bin_size as f64).collect(); + // The last bin could be smaller + *v.last_mut().expect("Should be at least one bin.") = last / last_size; + } + Ok(Array::from(v)) +} +fn to_array_zoom>>( + start: usize, + end: usize, + iter: I, + summary: Summary, + bins: usize, +) -> Result, BBIReadError> { + use numpy::ndarray::Array; + let mut v = match summary { + Summary::Min | Summary::Max => vec![f64::NAN; bins], + Summary::Mean => vec![0.0; bins], + }; + let bin_size = (end - start) as f64 / bins as f64; + for interval in iter { + let interval = interval?; + let interval_start = (interval.start as usize).max(start) - start; + let interval_end = (interval.end as usize).min(end) - start; + let bin_start = ((interval_start as f64) / bin_size) as usize; + let bin_end = ((interval_end as f64) / bin_size).ceil() as usize; + for bin in bin_start..bin_end { + let bin_start = (bin as f64) * bin_size; + let bin_end = (bin as f64 + 1.0) * bin_size; + let interval_start = (interval_start as f64).max(bin_start); + let interval_end = (interval_end as f64).min(bin_end); + // Possible overlaps + // bin |-----| |------| |-----| + // value |----| |----| |----| + // overlap ---- ------ ---- + match summary { + Summary::Min => { + v[bin] = v[bin].min(interval.summary.min_val as f64); + } + Summary::Max => { + v[bin] = v[bin].max(interval.summary.max_val as f64); + } + Summary::Mean => { + let overlap_size = interval_end - interval_start; + let zoom_mean = + (interval.summary.sum as f64) / (interval.summary.bases_covered as f64); + v[bin] += (overlap_size as f64) * zoom_mean; + } + } + } + } + if let Summary::Mean = summary { + let last = v.last().copied().expect("Should be at least one bin."); + let last_size = (end - start) as f64 - (bins as f64 - 1.0) * bin_size; + v = v.into_iter().map(|v| v / bin_size as f64).collect(); + // The last bin could be smaller + *v.last_mut().expect("Should be at least one bin.") = last / last_size; + } + Ok(Array::from(v)) +} +fn 
to_entry_array>>( + start: usize, + end: usize, + iter: I, +) -> Result, BBIReadError> { + use numpy::ndarray::Array; + let mut v = vec![0.0; end - start]; + for interval in iter { + let interval = interval?; + let interval_start = (interval.start as usize) - start; + let interval_end = (interval.end as usize) - start; + for i in v[interval_start..interval_end].iter_mut() { + *i += 1.0; + } + } + Ok(Array::from(v)) +} fn to_array_entry_bins>>( start: usize, end: usize, @@ -381,15 +673,9 @@ impl BBIRead { BBIReadRaw::BigWigFile(_) | BBIReadRaw::BigWigRemote(_) | BBIReadRaw::BigWigFileLike(_) => BEDGRAPH.to_string(), - BBIReadRaw::BigBedFile(b) => b - .autosql() - .map_err(|e| PyErr::new::(format!("{}", e)))?, - BBIReadRaw::BigBedRemote(b) => b - .autosql() - .map_err(|e| PyErr::new::(format!("{}", e)))?, - BBIReadRaw::BigBedFileLike(b) => b - .autosql() - .map_err(|e| PyErr::new::(format!("{}", e)))?, + BBIReadRaw::BigBedFile(b) => b.autosql().convert_err()?, + BBIReadRaw::BigBedRemote(b) => b.autosql().convert_err()?, + BBIReadRaw::BigBedFileLike(b) => b.autosql().convert_err()?, }; let obj = if parse { let mut declarations = parse_autosql(&schema).map_err(|_| { @@ -508,159 +794,23 @@ impl BBIRead { /// If start is not provided, it defaults to the beginning of the chromosome. /// /// This returns a numpy array. 
+ #[pyo3(signature = (chrom, start, end, bins=None, summary="mean".to_string(), exact=false, missing=0.0, oob=f64::NAN, arr=None))] fn values( &mut self, chrom: String, start: Option, end: Option, bins: Option, - summary: Option, - exact: Option, + summary: String, + exact: bool, + missing: f64, + oob: f64, + arr: Option, ) -> PyResult { - fn to_array>>( - start: usize, - end: usize, - iter: I, - ) -> Result, BBIReadError> { - use numpy::ndarray::Array; - let mut v = vec![f64::NAN; end - start]; - for interval in iter { - let interval = interval?; - let interval_start = (interval.start as usize) - start; - let interval_end = (interval.end as usize) - start; - for i in v[interval_start..interval_end].iter_mut() { - *i = interval.value as f64; - } - } - Ok(Array::from(v)) - } - fn to_array_bins>>( - start: usize, - end: usize, - iter: I, - summary: Summary, - bins: usize, - ) -> Result, BBIReadError> { - use numpy::ndarray::Array; - let mut v = match summary { - Summary::Min | Summary::Max => vec![f64::NAN; bins], - Summary::Mean => vec![0.0; bins], - }; - let bin_size = (end - start) as f64 / bins as f64; - for interval in iter { - let interval = interval?; - let interval_start = (interval.start as usize) - start; - let interval_end = (interval.end as usize) - start; - let bin_start = ((interval_start as f64) / bin_size) as usize; - let bin_end = ((interval_end as f64) / bin_size).ceil() as usize; - for bin in bin_start..bin_end { - let bin_start = (bin as f64) * bin_size; - let bin_end = (bin as f64 + 1.0) * bin_size; - let interval_start = (interval_start as f64).max(bin_start); - let interval_end = (interval_end as f64).min(bin_end); - // Possible overlaps - // bin |-----| |------| |-----| - // value |----| |----| |----| - // overlap ---- ------ ---- - match summary { - Summary::Min => { - v[bin] = v[bin].min(interval.value as f64); - } - Summary::Max => { - v[bin] = v[bin].max(interval.value as f64); - } - Summary::Mean => { - let overlap_size = interval_end - 
interval_start; - v[bin] += (overlap_size as f64) * interval.value as f64; - } - } - } - } - if let Summary::Mean = summary { - let last = v.last().copied().expect("Should be at least one bin."); - let last_size = (end - start) as f64 - (bins as f64 - 1.0) * bin_size; - v = v.into_iter().map(|v| v / bin_size as f64).collect(); - // The last bin could be smaller - *v.last_mut().expect("Should be at least one bin.") = last / last_size; - } - Ok(Array::from(v)) - } - fn to_array_zoom>>( - start: usize, - end: usize, - iter: I, - summary: Summary, - bins: usize, - ) -> Result, BBIReadError> { - use numpy::ndarray::Array; - let mut v = match summary { - Summary::Min | Summary::Max => vec![f64::NAN; bins], - Summary::Mean => vec![0.0; bins], - }; - let bin_size = (end - start) as f64 / bins as f64; - for interval in iter { - let interval = interval?; - let interval_start = (interval.start as usize).max(start) - start; - let interval_end = (interval.end as usize).min(end) - start; - let bin_start = ((interval_start as f64) / bin_size) as usize; - let bin_end = ((interval_end as f64) / bin_size).ceil() as usize; - for bin in bin_start..bin_end { - let bin_start = (bin as f64) * bin_size; - let bin_end = (bin as f64 + 1.0) * bin_size; - let interval_start = (interval_start as f64).max(bin_start); - let interval_end = (interval_end as f64).min(bin_end); - // Possible overlaps - // bin |-----| |------| |-----| - // value |----| |----| |----| - // overlap ---- ------ ---- - match summary { - Summary::Min => { - v[bin] = v[bin].min(interval.summary.min_val as f64); - } - Summary::Max => { - v[bin] = v[bin].max(interval.summary.max_val as f64); - } - Summary::Mean => { - let overlap_size = interval_end - interval_start; - let zoom_mean = (interval.summary.sum as f64) - / (interval.summary.bases_covered as f64); - v[bin] += (overlap_size as f64) * zoom_mean; - } - } - } - } - if let Summary::Mean = summary { - let last = v.last().copied().expect("Should be at least one bin."); - 
let last_size = (end - start) as f64 - (bins as f64 - 1.0) * bin_size; - v = v.into_iter().map(|v| v / bin_size as f64).collect(); - // The last bin could be smaller - *v.last_mut().expect("Should be at least one bin.") = last / last_size; - } - Ok(Array::from(v)) - } - fn to_entry_array>>( - start: usize, - end: usize, - iter: I, - ) -> Result, BBIReadError> { - use numpy::ndarray::Array; - let mut v = vec![0.0; end - start]; - for interval in iter { - let interval = interval?; - let interval_start = (interval.start as usize) - start; - let interval_end = (interval.end as usize) - start; - for i in v[interval_start..interval_end].iter_mut() { - *i += 1.0; - } - } - Ok(Array::from(v)) - } - - let summary = match summary.as_deref() { - None => Summary::Mean, - Some("mean") => Summary::Mean, - Some("min") => Summary::Min, - Some("max") => Summary::Max, + let summary = match summary.as_ref() { + "mean" => Summary::Mean, + "min" => Summary::Min, + "max" => Summary::Max, _ => { return Err(PyErr::new::(format!( "Unrecognized summary. Only `mean`, `min`, and `max` are allowed." @@ -668,213 +818,27 @@ impl BBIRead { } }; let (start, end) = start_end(&self.bbi, &chrom, start, end)?; - fn intervals_to_array( - b: &mut BigWigReadRaw, - chrom: &str, - start: u32, - end: u32, - bins: Option, - summary: Summary, - exact: Option, - ) -> PyResult { - match bins { - Some(bins) if !exact.unwrap_or(false) => { - let max_zoom_size = ((end - start) as f32 / (bins * 2) as f32) as u32; - let zoom = b - .info() - .zoom_headers - .iter() - .filter(|z| z.reduction_level <= max_zoom_size) - .min_by_key(|z| max_zoom_size - z.reduction_level); - match zoom { - Some(zoom) => { - let iter = b - .get_zoom_interval(&chrom, start, end, zoom.reduction_level) - .map_err(|e| { - PyErr::new::(format!("{}", e)) - })?; - Python::with_gil(|py| { - Ok( - to_array_zoom( - start as usize, - end as usize, - iter, - summary, - bins, - ) - .map_err(|e| { - PyErr::new::(format!("{}", e)) - })? 
- .into_pyarray(py) - .to_object(py), - ) - }) - } - None => { - let iter = b.get_interval(&chrom, start, end).map_err(|e| { - PyErr::new::(format!("{}", e)) - })?; - Python::with_gil(|py| { - Ok( - to_array_bins( - start as usize, - end as usize, - iter, - summary, - bins, - ) - .map_err(|e| { - PyErr::new::(format!("{}", e)) - })? - .into_pyarray(py) - .to_object(py), - ) - }) - } - } - } - Some(bins) => { - let iter = b - .get_interval(&chrom, start, end) - .map_err(|e| PyErr::new::(format!("{}", e)))?; - Python::with_gil(|py| { - Ok( - to_array_bins(start as usize, end as usize, iter, summary, bins) - .map_err(|e| { - PyErr::new::(format!("{}", e)) - })? - .into_pyarray(py) - .to_object(py), - ) - }) - } - _ => { - let iter = b - .get_interval(&chrom, start, end) - .map_err(|e| PyErr::new::(format!("{}", e)))?; - Python::with_gil(|py| { - Ok(to_array(start as usize, end as usize, iter) - .map_err(|e| { - PyErr::new::(format!("{}", e)) - })? - .into_pyarray(py) - .to_object(py)) - }) - } - } - } - fn entries_to_array( - b: &mut BigBedReadRaw, - chrom: &str, - start: u32, - end: u32, - bins: Option, - summary: Summary, - exact: Option, - ) -> PyResult { - match bins { - Some(bins) if !exact.unwrap_or(false) => { - let max_zoom_size = ((end - start) as f32 / (bins * 2) as f32) as u32; - let zoom = b - .info() - .zoom_headers - .iter() - .filter(|z| z.reduction_level <= max_zoom_size) - .min_by_key(|z| max_zoom_size - z.reduction_level); - match zoom { - Some(zoom) => { - let iter = b - .get_zoom_interval(&chrom, start, end, zoom.reduction_level) - .map_err(|e| { - PyErr::new::(format!("{}", e)) - })?; - Python::with_gil(|py| { - Ok(to_entry_array_zoom( - start as usize, - end as usize, - iter, - summary, - bins, - ) - .map_err(|e| { - PyErr::new::(format!("{}", e)) - })? 
- .into_pyarray(py) - .to_object(py)) - }) - } - None => { - let iter = b.get_interval(&chrom, start, end).map_err(|e| { - PyErr::new::(format!("{}", e)) - })?; - Python::with_gil(|py| { - Ok(to_array_entry_bins( - start as usize, - end as usize, - iter, - summary, - bins, - ) - .map_err(|e| { - PyErr::new::(format!("{}", e)) - })? - .into_pyarray(py) - .to_object(py)) - }) - } - } - } - Some(bins) => { - let iter = b - .get_interval(&chrom, start, end) - .map_err(|e| PyErr::new::(format!("{}", e)))?; - Python::with_gil(|py| { - Ok( - to_array_entry_bins(start as usize, end as usize, iter, summary, bins) - .map_err(|e| { - PyErr::new::(format!("{}", e)) - })? - .into_pyarray(py) - .to_object(py), - ) - }) - } - _ => { - let iter = b - .get_interval(&chrom, start, end) - .map_err(|e| PyErr::new::(format!("{}", e)))?; - Python::with_gil(|py| { - Ok(to_entry_array(start as usize, end as usize, iter) - .map_err(|e| { - PyErr::new::(format!("{}", e)) - })? - .into_pyarray(py) - .to_object(py)) - }) - } - } - } match &mut self.bbi { - BBIReadRaw::BigWigFile(b) => { - intervals_to_array(b, &chrom, start, end, bins, summary, exact) - } + BBIReadRaw::BigWigFile(b) => intervals_to_array( + b, &chrom, start, end, bins, summary, exact, missing, oob, arr, + ), #[cfg(feature = "remote")] - BBIReadRaw::BigWigRemote(b) => { - intervals_to_array(b, &chrom, start, end, bins, summary, exact) - } - BBIReadRaw::BigWigFileLike(b) => { - intervals_to_array(b, &chrom, start, end, bins, summary, exact) - } - BBIReadRaw::BigBedFile(b) => { - entries_to_array(b, &chrom, start, end, bins, summary, exact) - } + BBIReadRaw::BigWigRemote(b) => intervals_to_array( + b, &chrom, start, end, bins, summary, exact, missing, oob, arr, + ), + BBIReadRaw::BigWigFileLike(b) => intervals_to_array( + b, &chrom, start, end, bins, summary, exact, missing, oob, arr, + ), + BBIReadRaw::BigBedFile(b) => entries_to_array( + b, &chrom, start, end, bins, summary, exact, missing, oob, arr, + ), #[cfg(feature = 
"remote")] - BBIReadRaw::BigBedRemote(b) => { - entries_to_array(b, &chrom, start, end, bins, summary, exact) - } - BBIReadRaw::BigBedFileLike(b) => { - entries_to_array(b, &chrom, start, end, bins, summary, exact) - } + BBIReadRaw::BigBedRemote(b) => entries_to_array( + b, &chrom, start, end, bins, summary, exact, missing, oob, arr, + ), + BBIReadRaw::BigBedFileLike(b) => entries_to_array( + b, &chrom, start, end, bins, summary, exact, missing, oob, arr, + ), } } @@ -938,7 +902,7 @@ impl BigWigIntervalIterator { .next() .transpose() .map(|o| o.map(|v| (v.start, v.end, v.value))) - .map_err(|e| PyErr::new::(format!("{}", e))) + .convert_err() } } @@ -957,7 +921,7 @@ impl BigBedEntriesIterator { fn __next__(mut slf: PyRefMut) -> PyResult> { let py = slf.py(); let next = match slf.iter.next() { - Some(n) => n.map_err(|e| PyErr::new::(format!("{}", e)))?, + Some(n) => n.convert_err()?, None => return Ok(None), }; let elements: Vec<_> = [next.start.to_object(py), next.end.to_object(py)] From 51198d1e0d39d51284c6d11767e38de484d71162 Mon Sep 17 00:00:00 2001 From: Jack Huey <31162821+jackh726@users.noreply.github.com> Date: Fri, 22 Mar 2024 20:16:22 -0400 Subject: [PATCH 07/30] Accept i32 not u32 --- pybigtools/src/lib.rs | 232 ++++++++++++++++++++++++------------------ 1 file changed, 132 insertions(+), 100 deletions(-) diff --git a/pybigtools/src/lib.rs b/pybigtools/src/lib.rs index 73a9924..2d583ff 100644 --- a/pybigtools/src/lib.rs +++ b/pybigtools/src/lib.rs @@ -65,6 +65,45 @@ fn start_end( return Ok((start.unwrap_or(0), end.unwrap_or(length))); } +fn bigwig_start_end_length( + bbi: &BigWigReadRaw, + chrom_name: &str, + start: Option, + end: Option, +) -> PyResult<(i32, i32, i32)> { + let chroms = bbi.chroms(); + start_end_length_inner(chrom_name, chroms, start, end) +} + +fn bigbed_start_end_length( + bbi: &BigBedReadRaw, + chrom_name: &str, + start: Option, + end: Option, +) -> PyResult<(i32, i32, i32)> { + let chroms = bbi.chroms(); + 
start_end_length_inner(chrom_name, chroms, start, end) +} + +fn start_end_length_inner( + chrom_name: &str, + chroms: &[bigtools::ChromInfo], + start: Option, + end: Option, +) -> PyResult<(i32, i32, i32)> { + let chrom = chroms.into_iter().find(|x| x.name == chrom_name); + let length = match chrom { + None => { + return Err(PyErr::new::(format!( + "No chromomsome with name `{}` found.", + chrom_name + ))) + } + Some(c) => c.length as i32, + }; + return Ok((start.unwrap_or(0), end.unwrap_or(length), length)); +} + impl Reopen for PyFileLikeObject { fn reopen(&self) -> io::Result { Ok(self.clone()) @@ -116,8 +155,8 @@ impl ConvertResult for Result { fn intervals_to_array( b: &mut BigWigReadRaw, chrom: &str, - start: u32, - end: u32, + start: Option, + end: Option, bins: Option, summary: Summary, exact: bool, @@ -125,6 +164,7 @@ fn intervals_to_array( oob: f64, arr: Option, ) -> PyResult { + let (start, end, length) = bigwig_start_end_length(b, chrom, start, end)?; let zoom = if let (Some(bins), false) = (bins, exact) { let max_zoom_size = ((end - start) as f32 / (bins * 2) as f32) as u32; let zoom = b @@ -137,14 +177,15 @@ fn intervals_to_array( } else { None }; + let (intervals_start, intervals_end) = (start.max(0) as u32, end.min(length) as u32); match (bins, zoom) { (Some(bins), Some(zoom)) => { let iter = b - .get_zoom_interval(&chrom, start, end, zoom.reduction_level) + .get_zoom_interval(&chrom, intervals_start, intervals_end, zoom.reduction_level) .convert_err()?; Python::with_gil(|py| { Ok( - to_array_zoom(start as usize, end as usize, iter, summary, bins) + to_array_zoom(start, end, iter, summary, bins) .convert_err()? 
.into_pyarray(py) .to_object(py), @@ -152,10 +193,10 @@ fn intervals_to_array( }) } (Some(bins), None) => { - let iter = b.get_interval(&chrom, start, end).convert_err()?; + let iter = b.get_interval(&chrom, intervals_start, intervals_end).convert_err()?; Python::with_gil(|py| { Ok( - to_array_bins(start as usize, end as usize, iter, summary, bins) + to_array_bins(start, end, iter, summary, bins) .convert_err()? .into_pyarray(py) .to_object(py), @@ -163,9 +204,9 @@ fn intervals_to_array( }) } _ => { - let iter = b.get_interval(&chrom, start, end).convert_err()?; + let iter = b.get_interval(&chrom, intervals_start, intervals_end).convert_err()?; Python::with_gil(|py| { - Ok(to_array(start as usize, end as usize, iter) + Ok(to_array(start, end, iter) .convert_err()? .into_pyarray(py) .to_object(py)) @@ -176,8 +217,8 @@ fn intervals_to_array( fn entries_to_array( b: &mut BigBedReadRaw, chrom: &str, - start: u32, - end: u32, + start: Option, + end: Option, bins: Option, summary: Summary, exact: bool, @@ -185,47 +226,39 @@ fn entries_to_array( oob: f64, arr: Option, ) -> PyResult { - match bins { - Some(bins) if !exact => { - let max_zoom_size = ((end - start) as f32 / (bins * 2) as f32) as u32; - let zoom = b - .info() - .zoom_headers - .iter() - .filter(|z| z.reduction_level <= max_zoom_size) - .min_by_key(|z| max_zoom_size - z.reduction_level); - match zoom { - Some(zoom) => { - let iter = b - .get_zoom_interval(&chrom, start, end, zoom.reduction_level) - .convert_err()?; - Python::with_gil(|py| { - Ok( - to_entry_array_zoom(start as usize, end as usize, iter, summary, bins) - .convert_err()? - .into_pyarray(py) - .to_object(py), - ) - }) - } - None => { - let iter = b.get_interval(&chrom, start, end).convert_err()?; - Python::with_gil(|py| { - Ok( - to_array_entry_bins(start as usize, end as usize, iter, summary, bins) - .convert_err()? 
- .into_pyarray(py) - .to_object(py), - ) - }) - } - } + let (start, end, length) = bigbed_start_end_length(b, chrom, start, end)?; + let zoom = if let (Some(bins), false) = (bins, exact) { + let max_zoom_size = ((end - start) as f32 / (bins * 2) as f32) as u32; + let zoom = b + .info() + .zoom_headers + .iter() + .filter(|z| z.reduction_level <= max_zoom_size) + .min_by_key(|z| max_zoom_size - z.reduction_level); + zoom + } else { + None + }; + let (intervals_start, intervals_end) = (start.max(0) as u32, end.min(length) as u32); + match (bins, zoom) { + (Some(bins), Some(zoom)) => { + let iter = b + .get_zoom_interval(&chrom, intervals_start, intervals_end, zoom.reduction_level) + .convert_err()?; + Python::with_gil(|py| { + Ok( + to_entry_array_zoom(start, end, iter, summary, bins) + .convert_err()? + .into_pyarray(py) + .to_object(py), + ) + }) } - Some(bins) => { - let iter = b.get_interval(&chrom, start, end).convert_err()?; + (Some(bins), None) => { + let iter = b.get_interval(&chrom, intervals_start, intervals_end).convert_err()?; Python::with_gil(|py| { Ok( - to_array_entry_bins(start as usize, end as usize, iter, summary, bins) + to_entry_array_bins(start, end, iter, summary, bins) .convert_err()? .into_pyarray(py) .to_object(py), @@ -233,9 +266,9 @@ fn entries_to_array( }) } _ => { - let iter = b.get_interval(&chrom, start, end).convert_err()?; + let iter = b.get_interval(&chrom, intervals_start, intervals_end).convert_err()?; Python::with_gil(|py| { - Ok(to_entry_array(start as usize, end as usize, iter) + Ok(to_entry_array(start, end, iter) .convert_err()? 
.into_pyarray(py) .to_object(py)) @@ -244,16 +277,16 @@ fn entries_to_array( } } fn to_array>>( - start: usize, - end: usize, + start: i32, + end: i32, iter: I, ) -> Result, BBIReadError> { use numpy::ndarray::Array; - let mut v = vec![f64::NAN; end - start]; + let mut v = vec![f64::NAN; (end - start) as usize]; for interval in iter { let interval = interval?; - let interval_start = (interval.start as usize) - start; - let interval_end = (interval.end as usize) - start; + let interval_start = ((interval.start as i32) - start) as usize; + let interval_end = ((interval.end as i32) - start) as usize; for i in v[interval_start..interval_end].iter_mut() { *i = interval.value as f64; } @@ -261,8 +294,8 @@ fn to_array>>( Ok(Array::from(v)) } fn to_array_bins>>( - start: usize, - end: usize, + start: i32, + end: i32, iter: I, summary: Summary, bins: usize, @@ -275,8 +308,8 @@ fn to_array_bins>>( let bin_size = (end - start) as f64 / bins as f64; for interval in iter { let interval = interval?; - let interval_start = (interval.start as usize) - start; - let interval_end = (interval.end as usize) - start; + let interval_start = (interval.start as i32) - start; + let interval_end = (interval.end as i32) - start; let bin_start = ((interval_start as f64) / bin_size) as usize; let bin_end = ((interval_end as f64) / bin_size).ceil() as usize; for bin in bin_start..bin_end { @@ -312,8 +345,8 @@ fn to_array_bins>>( Ok(Array::from(v)) } fn to_array_zoom>>( - start: usize, - end: usize, + start: i32, + end: i32, iter: I, summary: Summary, bins: usize, @@ -326,8 +359,8 @@ fn to_array_zoom>>( let bin_size = (end - start) as f64 / bins as f64; for interval in iter { let interval = interval?; - let interval_start = (interval.start as usize).max(start) - start; - let interval_end = (interval.end as usize).min(end) - start; + let interval_start = (interval.start as i32).max(start) - start; + let interval_end = (interval.end as i32).min(end) - start; let bin_start = ((interval_start as f64) 
/ bin_size) as usize; let bin_end = ((interval_end as f64) / bin_size).ceil() as usize; for bin in bin_start..bin_end { @@ -365,37 +398,37 @@ fn to_array_zoom>>( Ok(Array::from(v)) } fn to_entry_array>>( - start: usize, - end: usize, + start: i32, + end: i32, iter: I, ) -> Result, BBIReadError> { use numpy::ndarray::Array; - let mut v = vec![0.0; end - start]; + let mut v = vec![0.0; (end - start) as usize]; for interval in iter { let interval = interval?; - let interval_start = (interval.start as usize) - start; - let interval_end = (interval.end as usize) - start; + let interval_start = ((interval.start as i32) - start) as usize; + let interval_end = ((interval.end as i32) - start) as usize; for i in v[interval_start..interval_end].iter_mut() { *i += 1.0; } } Ok(Array::from(v)) } -fn to_array_entry_bins>>( - start: usize, - end: usize, +fn to_entry_array_bins>>( + start: i32, + end: i32, iter: I, summary: Summary, bins: usize, ) -> Result, BBIReadError> { use numpy::ndarray::Array; - let mut bin_data: VecDeque<(usize, usize, usize, Vec)> = VecDeque::new(); + let mut bin_data: VecDeque<(usize, i32, i32, Vec)> = VecDeque::new(); let mut v = vec![0.0; bins]; let bin_size = (end - start) as f64 / bins as f64; for interval in iter { let interval = interval?; - let interval_start = (interval.start as usize).max(start) - start; - let interval_end = (interval.end as usize).min(end) - start; + let interval_start = (interval.start as i32).max(start) - start; + let interval_end = (interval.end as i32).min(end) - start; let bin_start = ((interval_start as f64) / bin_size) as usize; let bin_end = ((interval_end as f64) / bin_size).ceil() as usize; @@ -432,10 +465,10 @@ fn to_array_entry_bins>>( .map(|b| ((b.0 + 1) < bin_end).then(|| b.0 + 1)) .unwrap_or(Some(bin_start)) { - let bin_start = ((bin as f64) * bin_size).ceil() as usize; - let bin_end = (((bin + 1) as f64) * bin_size).ceil() as usize; + let bin_start = ((bin as f64) * bin_size).ceil() as i32; + let bin_end = (((bin 
+ 1) as f64) * bin_size).ceil() as i32; - bin_data.push_back((bin, bin_start, bin_end, vec![0.0; bin_end - bin_start])); + bin_data.push_back((bin, bin_start, bin_end, vec![0.0; (bin_end - bin_start) as usize])); } for bin in bin_data.iter() { assert!( @@ -453,7 +486,7 @@ fn to_array_entry_bins>>( let overlap_start = (*bin_start).max(interval_start); let overlap_end = (*bin_end).min(interval_end); - for i in &mut data[(overlap_start - *bin_start)..(overlap_end - *bin_start)] { + for i in &mut data[((overlap_start - *bin_start) as usize)..((overlap_end - *bin_start) as usize)] { *i = *i + 1.0; } } @@ -485,20 +518,20 @@ fn to_array_entry_bins>>( } fn to_entry_array_zoom>>( - start: usize, - end: usize, + start: i32, + end: i32, iter: I, summary: Summary, bins: usize, ) -> Result, BBIReadError> { use numpy::ndarray::Array; - let mut bin_data: VecDeque<(usize, usize, usize, Vec)> = VecDeque::new(); + let mut bin_data: VecDeque<(usize, i32, i32, Vec)> = VecDeque::new(); let mut v = vec![0.0; bins]; let bin_size = (end - start) as f64 / bins as f64; for interval in iter { let interval = interval?; - let interval_start = (interval.start as usize).max(start) - start; - let interval_end = (interval.end as usize).min(end) - start; + let interval_start = (interval.start as i32).max(start) - start; + let interval_end = (interval.end as i32).min(end) - start; let bin_start = ((interval_start as f64) / bin_size) as usize; let bin_end = ((interval_end as f64) / bin_size).ceil() as usize; @@ -537,10 +570,10 @@ fn to_entry_array_zoom>>( .map(|b| ((b.0 + 1) < bin_end).then(|| b.0 + 1)) .unwrap_or(Some(bin_start)) { - let bin_start = ((bin as f64) * bin_size).ceil() as usize; - let bin_end = (((bin + 1) as f64) * bin_size).ceil() as usize; + let bin_start = ((bin as f64) * bin_size).ceil() as i32; + let bin_end = (((bin + 1) as f64) * bin_size).ceil() as i32; - bin_data.push_back((bin, bin_start, bin_end, vec![f64::NAN; bin_end - bin_start])); + bin_data.push_back((bin, bin_start, 
bin_end, vec![f64::NAN; (bin_end - bin_start) as usize])); } for bin in bin_data.iter() { assert!( @@ -559,7 +592,7 @@ fn to_entry_array_zoom>>( let overlap_end = (*bin_end).min(interval_end); let mean = interval.summary.sum / (interval.end - interval.start) as f64; - for i in &mut data[(overlap_start - *bin_start)..(overlap_end - *bin_start)] { + for i in &mut data[((overlap_start - *bin_start) as usize)..((overlap_end - *bin_start) as usize)] { match summary { Summary::Mean => *i += mean, Summary::Min => *i = i.min(interval.summary.min_val), @@ -798,8 +831,8 @@ impl BBIRead { fn values( &mut self, chrom: String, - start: Option, - end: Option, + start: Option, + end: Option, bins: Option, summary: String, exact: bool, @@ -817,7 +850,6 @@ impl BBIRead { ))); } }; - let (start, end) = start_end(&self.bbi, &chrom, start, end)?; match &mut self.bbi { BBIReadRaw::BigWigFile(b) => intervals_to_array( b, &chrom, start, end, bins, summary, exact, missing, oob, arr, @@ -1596,16 +1628,16 @@ fn pybigtools(_py: Python, m: &PyModule) -> PyResult<()> { mod test { use bigtools::BedEntry; - use crate::to_array_entry_bins; + use crate::to_entry_array_bins; #[test] - fn test_to_array_entry_bins() { + fn test_to_entry_array_bins() { let entries = [BedEntry { start: 10, end: 20, rest: "".to_string(), }]; - let res = to_array_entry_bins( + let res = to_entry_array_bins( 10, 20, entries.into_iter().map(|v| Ok(v)), @@ -1621,7 +1653,7 @@ mod test { end: 20, rest: "".to_string(), }]; - let res = to_array_entry_bins( + let res = to_entry_array_bins( 10, 20, entries.into_iter().map(|v| Ok(v)), @@ -1644,7 +1676,7 @@ mod test { rest: "".to_string(), }, ]; - let res = to_array_entry_bins( + let res = to_entry_array_bins( 10, 20, entries.clone().into_iter().map(|v| Ok(v)), @@ -1654,7 +1686,7 @@ mod test { .unwrap(); let res = res.to_vec(); assert_eq!(res, [1.0, 2.0].into_iter().collect::>()); - let res = to_array_entry_bins( + let res = to_entry_array_bins( 10, 20, 
entries.clone().into_iter().map(|v| Ok(v)), @@ -1664,7 +1696,7 @@ mod test { .unwrap(); let res = res.to_vec(); assert_eq!(res, [1.0].into_iter().collect::>()); - let res = to_array_entry_bins( + let res = to_entry_array_bins( 10, 20, entries.clone().into_iter().map(|v| Ok(v)), @@ -1692,7 +1724,7 @@ mod test { rest: "".to_string(), }, ]; - let res = to_array_entry_bins( + let res = to_entry_array_bins( 10, 20, entries.clone().into_iter().map(|v| Ok(v)), @@ -1702,7 +1734,7 @@ mod test { .unwrap(); let res = res.to_vec(); assert_eq!(res, [1.0, 3.0].into_iter().collect::>()); - let res = to_array_entry_bins( + let res = to_entry_array_bins( 10, 20, entries.clone().into_iter().map(|v| Ok(v)), @@ -1712,7 +1744,7 @@ mod test { .unwrap(); let res = res.to_vec(); assert_eq!(res, [1.0].into_iter().collect::>()); - let res = to_array_entry_bins( + let res = to_entry_array_bins( 10, 20, entries.clone().into_iter().map(|v| Ok(v)), From 02bab3d1b8676712b6cb9d24887d0702f0447e0a Mon Sep 17 00:00:00 2001 From: Jack Huey <31162821+jackh726@users.noreply.github.com> Date: Tue, 26 Mar 2024 02:11:32 -0400 Subject: [PATCH 08/30] Allow passing in an array --- pybigtools/src/lib.rs | 646 +++++++++++++++++++++++++++++++----------- 1 file changed, 477 insertions(+), 169 deletions(-) diff --git a/pybigtools/src/lib.rs b/pybigtools/src/lib.rs index 2d583ff..32bf659 100644 --- a/pybigtools/src/lib.rs +++ b/pybigtools/src/lib.rs @@ -3,6 +3,7 @@ use std::collections::VecDeque; use std::fs::File; use std::io::{self, BufReader}; +use std::ops::IndexMut; use std::path::Path; use bigtools::bed::autosql::parse::parse_autosql; @@ -22,8 +23,8 @@ use bigtools::{ use bigtools::utils::reopen::Reopen; use file_like::PyFileLikeObject; -use numpy::ndarray::Array1; -use numpy::IntoPyArray; +use numpy::ndarray::ArrayViewMut; +use numpy::PyArray1; use pyo3::exceptions::{self, PyTypeError}; use pyo3::prelude::*; use pyo3::types::{IntoPyDict, PyAny, PyDict, PyFloat, PyInt, PyIterator, PyString, PyTuple}; @@ 
-153,6 +154,7 @@ impl ConvertResult for Result { } fn intervals_to_array( + py: Python<'_>, b: &mut BigWigReadRaw, chrom: &str, start: Option, @@ -178,43 +180,129 @@ fn intervals_to_array( None }; let (intervals_start, intervals_end) = (start.max(0) as u32, end.min(length) as u32); - match (bins, zoom) { - (Some(bins), Some(zoom)) => { - let iter = b - .get_zoom_interval(&chrom, intervals_start, intervals_end, zoom.reduction_level) - .convert_err()?; - Python::with_gil(|py| { - Ok( - to_array_zoom(start, end, iter, summary, bins) - .convert_err()? - .into_pyarray(py) - .to_object(py), + match bins { + Some(bins) => { + let arr = match arr { + Some(v) => v, + None => PyArray1::from_vec(py, vec![missing; bins]).to_object(py), + }; + let v: &PyArray1 = arr.downcast::>(py).map_err(|_| { + PyErr::new::( + "`arr` option must be a one-dimensional numpy array, if passed.", ) - }) - } - (Some(bins), None) => { - let iter = b.get_interval(&chrom, intervals_start, intervals_end).convert_err()?; - Python::with_gil(|py| { - Ok( - to_array_bins(start, end, iter, summary, bins) - .convert_err()? 
- .into_pyarray(py) - .to_object(py), - ) - }) + })?; + { + let mut array = v.readwrite(); + + match zoom { + Some(zoom) => { + let iter = b + .get_zoom_interval( + &chrom, + intervals_start, + intervals_end, + zoom.reduction_level, + ) + .convert_err()?; + to_array_zoom( + start, + end, + iter, + summary, + bins, + missing, + array.as_array_mut(), + ) + .convert_err()?; + } + None => { + let iter = b + .get_interval(&chrom, intervals_start, intervals_end) + .convert_err()?; + to_array_bins( + start, + end, + iter, + summary, + bins, + missing, + array.as_array_mut(), + ) + .convert_err()?; + } + }; + + let mut array: ArrayViewMut<'_, f64, numpy::Ix1> = array.as_array_mut(); + + let bin_size = (end - start) as f64 / bins as f64; + if start < 0 { + let bin_start = 0; + let interval_end = 0 - start; + let bin_end = ((interval_end as f64) / bin_size).ceil() as usize; + for i in bin_start..bin_end { + array[i] = oob; + } + } + if end >= length { + let interval_start = (length as i32) - start; + let interval_end = end - start; + let bin_start = ((interval_start as f64) / bin_size) as usize; + let bin_end = ((interval_end as f64) / bin_size).ceil() as usize; + assert_eq!(bin_end, array.len() - 1); + for i in bin_start..bin_end { + array[i] = oob; + } + } + } + Ok(arr) } _ => { - let iter = b.get_interval(&chrom, intervals_start, intervals_end).convert_err()?; - Python::with_gil(|py| { - Ok(to_array(start, end, iter) - .convert_err()? 
- .into_pyarray(py) - .to_object(py)) - }) + let arr = match arr { + Some(v) => v, + None => PyArray1::from_vec(py, vec![missing; (end - start) as usize]).to_object(py), + }; + let v: &PyArray1 = arr.downcast::>(py).map_err(|_| { + PyErr::new::( + "`arr` option must be a one-dimensional numpy array, if passed.", + ) + })?; + + { + let mut array = v.readwrite(); + + let iter = b + .get_interval(&chrom, intervals_start, intervals_end) + .convert_err()?; + to_array(start, end, iter, array.as_array_mut()).convert_err()?; + + let mut array: ArrayViewMut<'_, f64, numpy::Ix1> = array.as_array_mut(); + + let bin_size = (end - start) as f64; + if start < 0 { + let bin_start = 0; + let interval_end = 0 - start; + let bin_end = ((interval_end as f64) / bin_size).ceil() as usize; + for i in bin_start..bin_end { + array[i] = oob; + } + } + if end >= length { + let interval_start = (length as i32) - start; + let interval_end = end - start; + let bin_start = ((interval_start as f64) / bin_size) as usize; + let bin_end = ((interval_end as f64) / bin_size).ceil() as usize; + assert_eq!(bin_end, array.len() - 1); + for i in bin_start..bin_end { + array[i] = oob; + } + } + } + Ok(arr) } } } fn entries_to_array( + py: Python<'_>, b: &mut BigBedReadRaw, chrom: &str, start: Option, @@ -240,39 +328,108 @@ fn entries_to_array( None }; let (intervals_start, intervals_end) = (start.max(0) as u32, end.min(length) as u32); - match (bins, zoom) { - (Some(bins), Some(zoom)) => { - let iter = b - .get_zoom_interval(&chrom, intervals_start, intervals_end, zoom.reduction_level) - .convert_err()?; - Python::with_gil(|py| { - Ok( - to_entry_array_zoom(start, end, iter, summary, bins) - .convert_err()? 
- .into_pyarray(py) - .to_object(py), + match bins { + Some(bins) => { + let arr = match arr { + Some(v) => v, + None => PyArray1::from_vec(py, vec![missing; bins]).to_object(py), + }; + let v: &PyArray1 = arr.downcast::>(py).map_err(|_| { + PyErr::new::( + "`arr` option must be a one-dimensional numpy array, if passed.", ) - }) - } - (Some(bins), None) => { - let iter = b.get_interval(&chrom, intervals_start, intervals_end).convert_err()?; - Python::with_gil(|py| { - Ok( - to_entry_array_bins(start, end, iter, summary, bins) - .convert_err()? - .into_pyarray(py) - .to_object(py), - ) - }) + })?; + { + let mut array = v.readwrite(); + + match zoom { + Some(zoom) => { + let iter = b + .get_zoom_interval( + &chrom, + intervals_start, + intervals_end, + zoom.reduction_level, + ) + .convert_err()?; + to_entry_array_zoom(start, end, iter, summary, bins, array.as_array_mut()) + .convert_err()?; + } + None => { + let iter = b + .get_interval(&chrom, intervals_start, intervals_end) + .convert_err()?; + to_entry_array_bins(start, end, iter, summary, bins, array.as_array_mut()) + .convert_err()?; + } + }; + + let mut array: ArrayViewMut<'_, f64, numpy::Ix1> = array.as_array_mut(); + + let bin_size = (end - start) as f64 / bins as f64; + if start < 0 { + let bin_start = 0; + let interval_end = 0 - start; + let bin_end = ((interval_end as f64) / bin_size).ceil() as usize; + for i in bin_start..bin_end { + array[i] = oob; + } + } + if end >= length { + let interval_start = (length as i32) - start; + let interval_end = end - start; + let bin_start = ((interval_start as f64) / bin_size) as usize; + let bin_end = ((interval_end as f64) / bin_size).ceil() as usize; + assert_eq!(bin_end, array.len() - 1); + for i in bin_start..bin_end { + array[i] = oob; + } + } + } + Ok(arr) } _ => { - let iter = b.get_interval(&chrom, intervals_start, intervals_end).convert_err()?; - Python::with_gil(|py| { - Ok(to_entry_array(start, end, iter) - .convert_err()? 
- .into_pyarray(py) - .to_object(py)) - }) + let arr = match arr { + Some(v) => v, + None => PyArray1::from_vec(py, vec![missing; (end - start) as usize]).to_object(py), + }; + let v: &PyArray1 = arr.downcast::>(py).map_err(|_| { + PyErr::new::( + "`arr` option must be a one-dimensional numpy array, if passed.", + ) + })?; + + { + let mut array = v.readwrite(); + + let iter = b + .get_interval(&chrom, intervals_start, intervals_end) + .convert_err()?; + to_entry_array(start, end, iter, array.as_array_mut()).convert_err()?; + + let mut array: ArrayViewMut<'_, f64, numpy::Ix1> = array.as_array_mut(); + + let bin_size = (end - start) as f64; + if start < 0 { + let bin_start = 0; + let interval_end = 0 - start; + let bin_end = ((interval_end as f64) / bin_size).ceil() as usize; + for i in bin_start..bin_end { + array[i] = oob; + } + } + if end >= length { + let interval_start = (length as i32) - start; + let interval_end = end - start; + let bin_start = ((interval_start as f64) / bin_size) as usize; + let bin_end = ((interval_end as f64) / bin_size).ceil() as usize; + assert_eq!(bin_end, array.len() - 1); + for i in bin_start..bin_end { + array[i] = oob; + } + } + } + Ok(arr) } } } @@ -280,18 +437,18 @@ fn to_array>>( start: i32, end: i32, iter: I, -) -> Result, BBIReadError> { - use numpy::ndarray::Array; - let mut v = vec![f64::NAN; (end - start) as usize]; + mut v: ArrayViewMut<'_, f64, numpy::Ix1>, +) -> Result<(), BBIReadError> { + assert_eq!(v.len(), (end - start) as usize); for interval in iter { let interval = interval?; let interval_start = ((interval.start as i32) - start) as usize; let interval_end = ((interval.end as i32) - start) as usize; - for i in v[interval_start..interval_end].iter_mut() { - *i = interval.value as f64; + for i in interval_start..interval_end { + *v.index_mut(i) += interval.value as f64; } } - Ok(Array::from(v)) + Ok(()) } fn to_array_bins>>( start: i32, @@ -299,50 +456,104 @@ fn to_array_bins>>( iter: I, summary: Summary, bins: usize, 
-) -> Result, BBIReadError> { - use numpy::ndarray::Array; - let mut v = match summary { - Summary::Min | Summary::Max => vec![f64::NAN; bins], - Summary::Mean => vec![0.0; bins], - }; + missing: f64, + mut v: ArrayViewMut<'_, f64, numpy::Ix1>, +) -> Result<(), BBIReadError> { + assert_eq!(v.len(), bins); + + let mut bin_data: VecDeque<(usize, i32, i32, Option)> = VecDeque::new(); let bin_size = (end - start) as f64 / bins as f64; for interval in iter { let interval = interval?; - let interval_start = (interval.start as i32) - start; - let interval_end = (interval.end as i32) - start; + let interval_start = (interval.start as i32).max(start) - start; + let interval_end = (interval.end as i32).min(end) - start; let bin_start = ((interval_start as f64) / bin_size) as usize; let bin_end = ((interval_end as f64) / bin_size).ceil() as usize; + + while let Some(front) = bin_data.front_mut() { + if front.0 < bin_start { + let front = bin_data.pop_front().unwrap(); + let bin = front.0; + + match summary { + Summary::Min => { + v[bin] = front.3.unwrap_or(missing); + } + Summary::Max => { + v[bin] = front.3.unwrap_or(missing); + } + Summary::Mean => { + v[bin] = front.3.map(|v| v / bin_size).unwrap_or(missing); + } + } + } else { + break; + } + } + while let Some(bin) = bin_data + .back_mut() + .map(|b| ((b.0 + 1) < bin_end).then(|| b.0 + 1)) + .unwrap_or(Some(bin_start)) + { + let bin_start = ((bin as f64) * bin_size).ceil() as i32; + let bin_end = (((bin + 1) as f64) * bin_size).ceil() as i32; + + bin_data.push_back((bin, bin_start, bin_end, None)); + } + for bin in bin_data.iter() { + assert!( + (bin_start..bin_end).contains(&bin.0), + "{} not in {}..{}", + bin.0, + bin_start, + bin_end + ); + } for bin in bin_start..bin_end { - let bin_start = (bin as f64) * bin_size; - let bin_end = (bin as f64 + 1.0) * bin_size; - let interval_start = (interval_start as f64).max(bin_start); - let interval_end = (interval_end as f64).min(bin_end); - // Possible overlaps - // bin |-----| 
|------| |-----| - // value |----| |----| |----| - // overlap ---- ------ ---- + assert!(bin_data.iter().find(|b| b.0 == bin).is_some()); + } + for (_bin, bin_start, bin_end, data) in bin_data.iter_mut() { + let v = data.get_or_insert_with(|| { + match summary { + // min & max are defined for NAN and we are about to set it + // can't use 0.0 because it may be either below or above the real value + Summary::Min | Summary::Max => f64::NAN, + // addition is not defined for NAN + Summary::Mean => 0.0, + } + }); match summary { Summary::Min => { - v[bin] = v[bin].min(interval.value as f64); + *v = v.min(interval.value as f64); } Summary::Max => { - v[bin] = v[bin].max(interval.value as f64); + *v = v.max(interval.value as f64); } Summary::Mean => { - let overlap_size = interval_end - interval_start; - v[bin] += (overlap_size as f64) * interval.value as f64; + let overlap_start = (*bin_start).max(interval_start); + let overlap_end = (*bin_end).min(interval_end); + let overlap_size: i32 = overlap_end - overlap_start; + *v += (overlap_size as f64) * interval.value as f64; } } } } - if let Summary::Mean = summary { - let last = v.last().copied().expect("Should be at least one bin."); - let last_size = (end - start) as f64 - (bins as f64 - 1.0) * bin_size; - v = v.into_iter().map(|v| v / bin_size as f64).collect(); - // The last bin could be smaller - *v.last_mut().expect("Should be at least one bin.") = last / last_size; + while let Some(front) = bin_data.pop_front() { + let bin = front.0; + + match summary { + Summary::Min => { + v[bin] = front.3.unwrap_or(missing); + } + Summary::Max => { + v[bin] = front.3.unwrap_or(missing); + } + Summary::Mean => { + v[bin] = front.3.map(|v| v / bin_size).unwrap_or(missing); + } + } } - Ok(Array::from(v)) + Ok(()) } fn to_array_zoom>>( start: i32, @@ -350,12 +561,12 @@ fn to_array_zoom>>( iter: I, summary: Summary, bins: usize, -) -> Result, BBIReadError> { - use numpy::ndarray::Array; - let mut v = match summary { - Summary::Min | 
Summary::Max => vec![f64::NAN; bins], - Summary::Mean => vec![0.0; bins], - }; + missing: f64, + mut v: ArrayViewMut<'_, f64, numpy::Ix1>, +) -> Result<(), BBIReadError> { + assert_eq!(v.len(), bins); + + let mut bin_data: VecDeque<(usize, i32, i32, Option)> = VecDeque::new(); let bin_size = (end - start) as f64 / bins as f64; for interval in iter { let interval = interval?; @@ -363,56 +574,119 @@ fn to_array_zoom>>( let interval_end = (interval.end as i32).min(end) - start; let bin_start = ((interval_start as f64) / bin_size) as usize; let bin_end = ((interval_end as f64) / bin_size).ceil() as usize; - for bin in bin_start..bin_end { - let bin_start = (bin as f64) * bin_size; - let bin_end = (bin as f64 + 1.0) * bin_size; - let interval_start = (interval_start as f64).max(bin_start); - let interval_end = (interval_end as f64).min(bin_end); - // Possible overlaps - // bin |-----| |------| |-----| - // value |----| |----| |----| - // overlap ---- ------ ---- - match summary { - Summary::Min => { - v[bin] = v[bin].min(interval.summary.min_val as f64); - } - Summary::Max => { - v[bin] = v[bin].max(interval.summary.max_val as f64); - } - Summary::Mean => { - let overlap_size = interval_end - interval_start; - let zoom_mean = - (interval.summary.sum as f64) / (interval.summary.bases_covered as f64); - v[bin] += (overlap_size as f64) * zoom_mean; + + while let Some(front) = bin_data.front_mut() { + if front.0 < bin_start { + let front = bin_data.pop_front().unwrap(); + let bin = front.0; + + match summary { + Summary::Min => { + v[bin] = front.3.unwrap_or(missing); + } + Summary::Max => { + v[bin] = front.3.unwrap_or(missing); + } + Summary::Mean => { + v[bin] = front.3.map(|v| v / bin_size).unwrap_or(missing); + } } + } else { + break; + } + } + while let Some(bin) = bin_data + .back_mut() + .map(|b| ((b.0 + 1) < bin_end).then(|| b.0 + 1)) + .unwrap_or(Some(bin_start)) + { + let bin_start = ((bin as f64) * bin_size).ceil() as i32; + let bin_end = (((bin + 1) as f64) * 
bin_size).ceil() as i32; + + bin_data.push_back((bin, bin_start, bin_end, None)); + } + for bin in bin_data.iter() { + assert!( + (bin_start..bin_end).contains(&bin.0), + "{} not in {}..{}", + bin.0, + bin_start, + bin_end + ); + } + for bin in bin_start..bin_end { + assert!(bin_data.iter().find(|b| b.0 == bin).is_some()); + } + for (_bin, bin_start, bin_end, data) in bin_data.iter_mut() { + match data.as_mut() { + Some(v) => match summary { + Summary::Min => { + *v = v.min(interval.summary.min_val as f64); + } + Summary::Max => { + *v = v.max(interval.summary.max_val as f64); + } + Summary::Mean => { + let overlap_start = (*bin_start).max(interval_start); + let overlap_end = (*bin_end).min(interval_end); + let overlap_size = overlap_end - overlap_start; + let zoom_mean = + (interval.summary.sum as f64) / (interval.summary.bases_covered as f64); + *v += (overlap_size as f64) * zoom_mean; + } + }, + None => match summary { + Summary::Min => { + *data = Some(interval.summary.min_val as f64); + } + Summary::Max => { + *data = Some(interval.summary.max_val as f64); + } + Summary::Mean => { + let overlap_start = (*bin_start).max(interval_start); + let overlap_end = (*bin_end).min(interval_end); + let overlap_size = overlap_end - overlap_start; + let zoom_mean = + (interval.summary.sum as f64) / (interval.summary.bases_covered as f64); + *data = Some((overlap_size as f64) * zoom_mean); + } + }, } } } - if let Summary::Mean = summary { - let last = v.last().copied().expect("Should be at least one bin."); - let last_size = (end - start) as f64 - (bins as f64 - 1.0) * bin_size; - v = v.into_iter().map(|v| v / bin_size as f64).collect(); - // The last bin could be smaller - *v.last_mut().expect("Should be at least one bin.") = last / last_size; + while let Some(front) = bin_data.pop_front() { + let bin = front.0; + + match summary { + Summary::Min => { + v[bin] = front.3.unwrap_or(missing); + } + Summary::Max => { + v[bin] = front.3.unwrap_or(missing); + } + Summary::Mean => 
{ + v[bin] = front.3.map(|v| v / bin_size).unwrap_or(missing); + } + } } - Ok(Array::from(v)) + Ok(()) } fn to_entry_array>>( start: i32, end: i32, iter: I, -) -> Result, BBIReadError> { - use numpy::ndarray::Array; - let mut v = vec![0.0; (end - start) as usize]; + mut v: ArrayViewMut<'_, f64, numpy::Ix1>, +) -> Result<(), BBIReadError> { + assert_eq!(v.len(), (end - start) as usize); for interval in iter { let interval = interval?; let interval_start = ((interval.start as i32) - start) as usize; let interval_end = ((interval.end as i32) - start) as usize; - for i in v[interval_start..interval_end].iter_mut() { - *i += 1.0; + for i in interval_start..interval_end { + *v.index_mut(i) += 1.0; } } - Ok(Array::from(v)) + Ok(()) } fn to_entry_array_bins>>( start: i32, @@ -420,10 +694,11 @@ fn to_entry_array_bins>>( iter: I, summary: Summary, bins: usize, -) -> Result, BBIReadError> { - use numpy::ndarray::Array; + mut v: ArrayViewMut<'_, f64, numpy::Ix1>, +) -> Result<(), BBIReadError> { + assert_eq!(v.len(), bins); + let mut bin_data: VecDeque<(usize, i32, i32, Vec)> = VecDeque::new(); - let mut v = vec![0.0; bins]; let bin_size = (end - start) as f64 / bins as f64; for interval in iter { let interval = interval?; @@ -468,7 +743,12 @@ fn to_entry_array_bins>>( let bin_start = ((bin as f64) * bin_size).ceil() as i32; let bin_end = (((bin + 1) as f64) * bin_size).ceil() as i32; - bin_data.push_back((bin, bin_start, bin_end, vec![0.0; (bin_end - bin_start) as usize])); + bin_data.push_back(( + bin, + bin_start, + bin_end, + vec![0.0; (bin_end - bin_start) as usize], + )); } for bin in bin_data.iter() { assert!( @@ -486,7 +766,9 @@ fn to_entry_array_bins>>( let overlap_start = (*bin_start).max(interval_start); let overlap_end = (*bin_end).min(interval_end); - for i in &mut data[((overlap_start - *bin_start) as usize)..((overlap_end - *bin_start) as usize)] { + for i in &mut data + [((overlap_start - *bin_start) as usize)..((overlap_end - *bin_start) as usize)] + { *i = *i 
+ 1.0; } } @@ -514,7 +796,7 @@ fn to_entry_array_bins>>( } } } - Ok(Array::from(v)) + Ok(()) } fn to_entry_array_zoom>>( @@ -523,10 +805,11 @@ fn to_entry_array_zoom>>( iter: I, summary: Summary, bins: usize, -) -> Result, BBIReadError> { - use numpy::ndarray::Array; + mut v: ArrayViewMut<'_, f64, numpy::Ix1>, +) -> Result<(), BBIReadError> { + assert_eq!(v.len(), bins); + let mut bin_data: VecDeque<(usize, i32, i32, Vec)> = VecDeque::new(); - let mut v = vec![0.0; bins]; let bin_size = (end - start) as f64 / bins as f64; for interval in iter { let interval = interval?; @@ -573,7 +856,12 @@ fn to_entry_array_zoom>>( let bin_start = ((bin as f64) * bin_size).ceil() as i32; let bin_end = (((bin + 1) as f64) * bin_size).ceil() as i32; - bin_data.push_back((bin, bin_start, bin_end, vec![f64::NAN; (bin_end - bin_start) as usize])); + bin_data.push_back(( + bin, + bin_start, + bin_end, + vec![f64::NAN; (bin_end - bin_start) as usize], + )); } for bin in bin_data.iter() { assert!( @@ -592,7 +880,9 @@ fn to_entry_array_zoom>>( let overlap_end = (*bin_end).min(interval_end); let mean = interval.summary.sum / (interval.end - interval.start) as f64; - for i in &mut data[((overlap_start - *bin_start) as usize)..((overlap_end - *bin_start) as usize)] { + for i in &mut data + [((overlap_start - *bin_start) as usize)..((overlap_end - *bin_start) as usize)] + { match summary { Summary::Mean => *i += mean, Summary::Min => *i = i.min(interval.summary.min_val), @@ -627,7 +917,7 @@ fn to_entry_array_zoom>>( } } } - Ok(Array::from(v)) + Ok(()) } /// This class is the interface for reading a bigWig or bigBed. 
@@ -830,6 +1120,7 @@ impl BBIRead { #[pyo3(signature = (chrom, start, end, bins=None, summary="mean".to_string(), exact=false, missing=0.0, oob=f64::NAN, arr=None))] fn values( &mut self, + py: Python<'_>, chrom: String, start: Option, end: Option, @@ -852,24 +1143,24 @@ impl BBIRead { }; match &mut self.bbi { BBIReadRaw::BigWigFile(b) => intervals_to_array( - b, &chrom, start, end, bins, summary, exact, missing, oob, arr, + py, b, &chrom, start, end, bins, summary, exact, missing, oob, arr, ), #[cfg(feature = "remote")] BBIReadRaw::BigWigRemote(b) => intervals_to_array( - b, &chrom, start, end, bins, summary, exact, missing, oob, arr, + py, b, &chrom, start, end, bins, summary, exact, missing, oob, arr, ), BBIReadRaw::BigWigFileLike(b) => intervals_to_array( - b, &chrom, start, end, bins, summary, exact, missing, oob, arr, + py, b, &chrom, start, end, bins, summary, exact, missing, oob, arr, ), BBIReadRaw::BigBedFile(b) => entries_to_array( - b, &chrom, start, end, bins, summary, exact, missing, oob, arr, + py, b, &chrom, start, end, bins, summary, exact, missing, oob, arr, ), #[cfg(feature = "remote")] BBIReadRaw::BigBedRemote(b) => entries_to_array( - b, &chrom, start, end, bins, summary, exact, missing, oob, arr, + py, b, &chrom, start, end, bins, summary, exact, missing, oob, arr, ), BBIReadRaw::BigBedFileLike(b) => entries_to_array( - b, &chrom, start, end, bins, summary, exact, missing, oob, arr, + py, b, &chrom, start, end, bins, summary, exact, missing, oob, arr, ), } } @@ -1627,6 +1918,7 @@ fn pybigtools(_py: Python, m: &PyModule) -> PyResult<()> { #[cfg(test)] mod test { use bigtools::BedEntry; + use numpy::ndarray::Array; use crate::to_entry_array_bins; @@ -1637,15 +1929,17 @@ mod test { end: 20, rest: "".to_string(), }]; - let res = to_entry_array_bins( + let mut arr = Array::from(vec![f64::NAN]); + to_entry_array_bins( 10, 20, entries.into_iter().map(|v| Ok(v)), crate::Summary::Mean, 1, + arr.view_mut(), ) .unwrap(); - let res = res.to_vec(); + let 
res = arr.to_vec(); assert_eq!(res, [1.0].into_iter().collect::>()); let entries = [BedEntry { @@ -1653,15 +1947,17 @@ mod test { end: 20, rest: "".to_string(), }]; - let res = to_entry_array_bins( + let mut arr = Array::from(vec![f64::NAN, f64::NAN]); + to_entry_array_bins( 10, 20, entries.into_iter().map(|v| Ok(v)), crate::Summary::Mean, 2, + arr.view_mut(), ) .unwrap(); - let res = res.to_vec(); + let res = arr.to_vec(); assert_eq!(res, [1.0, 1.0].into_iter().collect::>()); let entries = [ @@ -1676,35 +1972,41 @@ mod test { rest: "".to_string(), }, ]; - let res = to_entry_array_bins( + let mut arr = Array::from(vec![f64::NAN, f64::NAN]); + to_entry_array_bins( 10, 20, entries.clone().into_iter().map(|v| Ok(v)), crate::Summary::Mean, 2, + arr.view_mut(), ) .unwrap(); - let res = res.to_vec(); + let res = arr.to_vec(); assert_eq!(res, [1.0, 2.0].into_iter().collect::>()); - let res = to_entry_array_bins( + let mut arr = Array::from(vec![f64::NAN]); + to_entry_array_bins( 10, 20, entries.clone().into_iter().map(|v| Ok(v)), crate::Summary::Min, 1, + arr.view_mut(), ) .unwrap(); - let res = res.to_vec(); + let res = arr.to_vec(); assert_eq!(res, [1.0].into_iter().collect::>()); - let res = to_entry_array_bins( + let mut arr = Array::from(vec![f64::NAN]); + to_entry_array_bins( 10, 20, entries.clone().into_iter().map(|v| Ok(v)), crate::Summary::Max, 1, + arr.view_mut(), ) .unwrap(); - let res = res.to_vec(); + let res = arr.to_vec(); assert_eq!(res, [2.0].into_iter().collect::>()); let entries = [ @@ -1724,35 +2026,41 @@ mod test { rest: "".to_string(), }, ]; - let res = to_entry_array_bins( + let mut arr = Array::from(vec![f64::NAN, f64::NAN]); + to_entry_array_bins( 10, 20, entries.clone().into_iter().map(|v| Ok(v)), crate::Summary::Mean, 2, + arr.view_mut(), ) .unwrap(); - let res = res.to_vec(); + let res = arr.to_vec(); assert_eq!(res, [1.0, 3.0].into_iter().collect::>()); - let res = to_entry_array_bins( + let mut arr = Array::from(vec![f64::NAN]); + 
to_entry_array_bins( 10, 20, entries.clone().into_iter().map(|v| Ok(v)), crate::Summary::Min, 1, + arr.view_mut(), ) .unwrap(); - let res = res.to_vec(); + let res = arr.to_vec(); assert_eq!(res, [1.0].into_iter().collect::>()); - let res = to_entry_array_bins( + let mut arr = Array::from(vec![f64::NAN]); + to_entry_array_bins( 10, 20, entries.clone().into_iter().map(|v| Ok(v)), crate::Summary::Max, 1, + arr.view_mut(), ) .unwrap(); - let res = res.to_vec(); + let res = arr.to_vec(); assert_eq!(res, [3.0].into_iter().collect::>()); } } From 1708ab15b026c1bb41e7f56a9006c2bea88bc6bc Mon Sep 17 00:00:00 2001 From: Jack Huey <31162821+jackh726@users.noreply.github.com> Date: Wed, 27 Mar 2024 17:42:46 -0400 Subject: [PATCH 09/30] Cleanup and some fixes --- pybigtools/src/lib.rs | 491 +++++++++++++++++++++++------------------- 1 file changed, 271 insertions(+), 220 deletions(-) diff --git a/pybigtools/src/lib.rs b/pybigtools/src/lib.rs index 32bf659..88cbb8a 100644 --- a/pybigtools/src/lib.rs +++ b/pybigtools/src/lib.rs @@ -179,127 +179,114 @@ fn intervals_to_array( } else { None }; + let arr = match (bins, arr) { + (_, Some(arr)) => arr, + (Some(bins), None) => PyArray1::from_vec(py, vec![missing; bins]).to_object(py), + (None, None) => PyArray1::from_vec(py, vec![missing; (end - start) as usize]).to_object(py), + }; + let v: &PyArray1 = arr.downcast::>(py).map_err(|_| { + PyErr::new::( + "`arr` option must be a one-dimensional numpy array, if passed.", + ) + })?; let (intervals_start, intervals_end) = (start.max(0) as u32, end.min(length) as u32); - match bins { + let bin_size = match bins { Some(bins) => { - let arr = match arr { - Some(v) => v, - None => PyArray1::from_vec(py, vec![missing; bins]).to_object(py), - }; - let v: &PyArray1 = arr.downcast::>(py).map_err(|_| { - PyErr::new::( - "`arr` option must be a one-dimensional numpy array, if passed.", - ) - })?; - { - let mut array = v.readwrite(); - - match zoom { - Some(zoom) => { - let iter = b - 
.get_zoom_interval( - &chrom, - intervals_start, - intervals_end, - zoom.reduction_level, - ) - .convert_err()?; - to_array_zoom( - start, - end, - iter, - summary, - bins, - missing, - array.as_array_mut(), - ) - .convert_err()?; - } - None => { - let iter = b - .get_interval(&chrom, intervals_start, intervals_end) - .convert_err()?; - to_array_bins( - start, - end, - iter, - summary, - bins, - missing, - array.as_array_mut(), - ) - .convert_err()?; - } - }; + if v.len() != bins { + return Err(PyErr::new::(format!( + "`arr` does not the expected size (expected `{}`, found `{}`), if passed.", + bins, + v.len(), + ))); + } - let mut array: ArrayViewMut<'_, f64, numpy::Ix1> = array.as_array_mut(); + let mut array = v.readwrite(); - let bin_size = (end - start) as f64 / bins as f64; - if start < 0 { - let bin_start = 0; - let interval_end = 0 - start; - let bin_end = ((interval_end as f64) / bin_size).ceil() as usize; - for i in bin_start..bin_end { - array[i] = oob; - } + match zoom { + Some(zoom) => { + let iter = b + .get_zoom_interval( + &chrom, + intervals_start, + intervals_end, + zoom.reduction_level, + ) + .convert_err()?; + to_array_zoom( + start, + end, + iter, + summary, + bins, + missing, + array.as_array_mut(), + ) + .convert_err()?; } - if end >= length { - let interval_start = (length as i32) - start; - let interval_end = end - start; - let bin_start = ((interval_start as f64) / bin_size) as usize; - let bin_end = ((interval_end as f64) / bin_size).ceil() as usize; - assert_eq!(bin_end, array.len() - 1); - for i in bin_start..bin_end { - array[i] = oob; - } + None => { + let iter = b + .get_interval(&chrom, intervals_start, intervals_end) + .convert_err()?; + to_array_bins( + start, + end, + iter, + summary, + bins, + missing, + array.as_array_mut(), + ) + .convert_err()?; } - } - Ok(arr) + }; + + (end - start) as f64 / bins as f64 } _ => { - let arr = match arr { - Some(v) => v, - None => PyArray1::from_vec(py, vec![missing; (end - start) as 
usize]).to_object(py), - }; - let v: &PyArray1 = arr.downcast::>(py).map_err(|_| { - PyErr::new::( - "`arr` option must be a one-dimensional numpy array, if passed.", - ) - })?; + if v.len() != (end - start) as usize { + return Err(PyErr::new::(format!( + "`arr` does not the expected size (expected `{}`, found `{}`), if passed.", + (end - start) as usize, + v.len(), + ))); + } - { - let mut array = v.readwrite(); + let mut array = v.readwrite(); - let iter = b - .get_interval(&chrom, intervals_start, intervals_end) - .convert_err()?; - to_array(start, end, iter, array.as_array_mut()).convert_err()?; + let iter = b + .get_interval(&chrom, intervals_start, intervals_end) + .convert_err()?; + to_array(start, end, iter, missing, array.as_array_mut()).convert_err()?; - let mut array: ArrayViewMut<'_, f64, numpy::Ix1> = array.as_array_mut(); + (end - start) as f64 + } + }; - let bin_size = (end - start) as f64; - if start < 0 { - let bin_start = 0; - let interval_end = 0 - start; - let bin_end = ((interval_end as f64) / bin_size).ceil() as usize; - for i in bin_start..bin_end { - array[i] = oob; - } - } - if end >= length { - let interval_start = (length as i32) - start; - let interval_end = end - start; - let bin_start = ((interval_start as f64) / bin_size) as usize; - let bin_end = ((interval_end as f64) / bin_size).ceil() as usize; - assert_eq!(bin_end, array.len() - 1); - for i in bin_start..bin_end { - array[i] = oob; - } - } + { + let mut array = v.readwrite(); + let mut array: ArrayViewMut<'_, f64, numpy::Ix1> = array.as_array_mut(); + + if start < 0 { + let bin_start = 0; + let interval_end = 0 - start; + let bin_end = ((interval_end as f64) / bin_size).ceil() as usize; + for i in bin_start..bin_end { + array[i] = oob; + } + } + if end >= length { + let interval_start = (length as i32) - start; + let interval_end = end - start; + let bin_start = ((interval_start as f64) / bin_size) as usize; + let bin_end = ((interval_end as f64) / bin_size).ceil() as usize; + 
assert_eq!(bin_end, array.len() - 1); + for i in bin_start..bin_end { + array[i] = oob; } - Ok(arr) } } + + Ok(arr) } fn entries_to_array( py: Python<'_>, @@ -327,119 +314,125 @@ fn entries_to_array( } else { None }; + let arr = match (bins, arr) { + (_, Some(arr)) => arr, + (Some(bins), None) => PyArray1::from_vec(py, vec![missing; bins]).to_object(py), + (None, None) => PyArray1::from_vec(py, vec![missing; (end - start) as usize]).to_object(py), + }; + let v: &PyArray1 = arr.downcast::>(py).map_err(|_| { + PyErr::new::( + "`arr` option must be a one-dimensional numpy array, if passed.", + ) + })?; let (intervals_start, intervals_end) = (start.max(0) as u32, end.min(length) as u32); - match bins { + let bin_size = match bins { Some(bins) => { - let arr = match arr { - Some(v) => v, - None => PyArray1::from_vec(py, vec![missing; bins]).to_object(py), - }; - let v: &PyArray1 = arr.downcast::>(py).map_err(|_| { - PyErr::new::( - "`arr` option must be a one-dimensional numpy array, if passed.", - ) - })?; - { - let mut array = v.readwrite(); - - match zoom { - Some(zoom) => { - let iter = b - .get_zoom_interval( - &chrom, - intervals_start, - intervals_end, - zoom.reduction_level, - ) - .convert_err()?; - to_entry_array_zoom(start, end, iter, summary, bins, array.as_array_mut()) - .convert_err()?; - } - None => { - let iter = b - .get_interval(&chrom, intervals_start, intervals_end) - .convert_err()?; - to_entry_array_bins(start, end, iter, summary, bins, array.as_array_mut()) - .convert_err()?; - } - }; + if v.len() != bins { + return Err(PyErr::new::(format!( + "`arr` does not the expected size (expected `{}`, found `{}`), if passed.", + bins, + v.len(), + ))); + } - let mut array: ArrayViewMut<'_, f64, numpy::Ix1> = array.as_array_mut(); + let mut array = v.readwrite(); - let bin_size = (end - start) as f64 / bins as f64; - if start < 0 { - let bin_start = 0; - let interval_end = 0 - start; - let bin_end = ((interval_end as f64) / bin_size).ceil() as usize; - for i 
in bin_start..bin_end { - array[i] = oob; - } + match zoom { + Some(zoom) => { + let iter = b + .get_zoom_interval( + &chrom, + intervals_start, + intervals_end, + zoom.reduction_level, + ) + .convert_err()?; + to_entry_array_zoom( + start, + end, + iter, + summary, + bins, + missing, + array.as_array_mut(), + ) + .convert_err()?; } - if end >= length { - let interval_start = (length as i32) - start; - let interval_end = end - start; - let bin_start = ((interval_start as f64) / bin_size) as usize; - let bin_end = ((interval_end as f64) / bin_size).ceil() as usize; - assert_eq!(bin_end, array.len() - 1); - for i in bin_start..bin_end { - array[i] = oob; - } + None => { + let iter = b + .get_interval(&chrom, intervals_start, intervals_end) + .convert_err()?; + to_entry_array_bins( + start, + end, + iter, + summary, + bins, + missing, + array.as_array_mut(), + ) + .convert_err()?; } - } - Ok(arr) + }; + + (end - start) as f64 / bins as f64 } _ => { - let arr = match arr { - Some(v) => v, - None => PyArray1::from_vec(py, vec![missing; (end - start) as usize]).to_object(py), - }; - let v: &PyArray1 = arr.downcast::>(py).map_err(|_| { - PyErr::new::( - "`arr` option must be a one-dimensional numpy array, if passed.", - ) - })?; + if v.len() != (end - start) as usize { + return Err(PyErr::new::(format!( + "`arr` does not the expected size (expected `{}`, found `{}`), if passed.", + (end - start) as usize, + v.len(), + ))); + } - { - let mut array = v.readwrite(); + let mut array = v.readwrite(); - let iter = b - .get_interval(&chrom, intervals_start, intervals_end) - .convert_err()?; - to_entry_array(start, end, iter, array.as_array_mut()).convert_err()?; + let iter = b + .get_interval(&chrom, intervals_start, intervals_end) + .convert_err()?; + to_entry_array(start, end, iter, missing, array.as_array_mut()).convert_err()?; - let mut array: ArrayViewMut<'_, f64, numpy::Ix1> = array.as_array_mut(); + (end - start) as f64 + } + }; - let bin_size = (end - start) as f64; - if 
start < 0 { - let bin_start = 0; - let interval_end = 0 - start; - let bin_end = ((interval_end as f64) / bin_size).ceil() as usize; - for i in bin_start..bin_end { - array[i] = oob; - } - } - if end >= length { - let interval_start = (length as i32) - start; - let interval_end = end - start; - let bin_start = ((interval_start as f64) / bin_size) as usize; - let bin_end = ((interval_end as f64) / bin_size).ceil() as usize; - assert_eq!(bin_end, array.len() - 1); - for i in bin_start..bin_end { - array[i] = oob; - } - } + { + let mut array = v.readwrite(); + let mut array: ArrayViewMut<'_, f64, numpy::Ix1> = array.as_array_mut(); + + if start < 0 { + let bin_start = 0; + let interval_end = 0 - start; + let bin_end = ((interval_end as f64) / bin_size).ceil() as usize; + for i in bin_start..bin_end { + array[i] = oob; + } + } + if end >= length { + let interval_start = (length as i32) - start; + let interval_end = end - start; + let bin_start = ((interval_start as f64) / bin_size) as usize; + let bin_end = ((interval_end as f64) / bin_size).ceil() as usize; + assert_eq!(bin_end, array.len() - 1); + for i in bin_start..bin_end { + array[i] = oob; } - Ok(arr) } } + + Ok(arr) } + fn to_array>>( start: i32, end: i32, iter: I, + missing: f64, mut v: ArrayViewMut<'_, f64, numpy::Ix1>, ) -> Result<(), BBIReadError> { assert_eq!(v.len(), (end - start) as usize); + v.fill(missing); for interval in iter { let interval = interval?; let interval_start = ((interval.start as i32) - start) as usize; @@ -460,6 +453,7 @@ fn to_array_bins>>( mut v: ArrayViewMut<'_, f64, numpy::Ix1>, ) -> Result<(), BBIReadError> { assert_eq!(v.len(), bins); + v.fill(missing); let mut bin_data: VecDeque<(usize, i32, i32, Option)> = VecDeque::new(); let bin_size = (end - start) as f64 / bins as f64; @@ -565,6 +559,7 @@ fn to_array_zoom>>( mut v: ArrayViewMut<'_, f64, numpy::Ix1>, ) -> Result<(), BBIReadError> { assert_eq!(v.len(), bins); + v.fill(missing); let mut bin_data: VecDeque<(usize, i32, i32, 
Option)> = VecDeque::new(); let bin_size = (end - start) as f64 / bins as f64; @@ -675,9 +670,11 @@ fn to_entry_array>>( start: i32, end: i32, iter: I, + missing: f64, mut v: ArrayViewMut<'_, f64, numpy::Ix1>, ) -> Result<(), BBIReadError> { assert_eq!(v.len(), (end - start) as usize); + v.fill(missing); for interval in iter { let interval = interval?; let interval_start = ((interval.start as i32) - start) as usize; @@ -694,9 +691,11 @@ fn to_entry_array_bins>>( iter: I, summary: Summary, bins: usize, + missing: f64, mut v: ArrayViewMut<'_, f64, numpy::Ix1>, ) -> Result<(), BBIReadError> { assert_eq!(v.len(), bins); + v.fill(missing); let mut bin_data: VecDeque<(usize, i32, i32, Vec)> = VecDeque::new(); let bin_size = (end - start) as f64 / bins as f64; @@ -718,14 +717,14 @@ fn to_entry_array_bins>>( .3 .into_iter() .reduce(|min, v| min.min(v)) - .unwrap_or(0.0); + .unwrap_or(missing); } Summary::Max => { v[bin] = front .3 .into_iter() .reduce(|max, v| max.max(v)) - .unwrap_or(0.0); + .unwrap_or(missing); } Summary::Mean => { v[bin] = front.3.into_iter().sum::() / bin_size; @@ -747,7 +746,7 @@ fn to_entry_array_bins>>( bin, bin_start, bin_end, - vec![0.0; (bin_end - bin_start) as usize], + vec![missing; (bin_end - bin_start) as usize], )); } for bin in bin_data.iter() { @@ -769,7 +768,8 @@ fn to_entry_array_bins>>( for i in &mut data [((overlap_start - *bin_start) as usize)..((overlap_end - *bin_start) as usize)] { - *i = *i + 1.0; + // If NAN, then 0.0 + 1.0, else i + 1.0 + *i = (*i).max(0.0) + 1.0; } } } @@ -782,14 +782,14 @@ fn to_entry_array_bins>>( .3 .into_iter() .reduce(|min, v| min.min(v)) - .unwrap_or(0.0); + .unwrap_or(missing); } Summary::Max => { v[bin] = front .3 .into_iter() .reduce(|max, v| max.max(v)) - .unwrap_or(0.0); + .unwrap_or(missing); } Summary::Mean => { v[bin] = front.3.into_iter().sum::() / bin_size; @@ -805,9 +805,11 @@ fn to_entry_array_zoom>>( iter: I, summary: Summary, bins: usize, + missing: f64, mut v: ArrayViewMut<'_, f64, 
numpy::Ix1>, ) -> Result<(), BBIReadError> { assert_eq!(v.len(), bins); + v.fill(missing); let mut bin_data: VecDeque<(usize, i32, i32, Vec)> = VecDeque::new(); let bin_size = (end - start) as f64 / bins as f64; @@ -829,16 +831,14 @@ fn to_entry_array_zoom>>( .3 .into_iter() .reduce(|min, v| min.min(v)) - .unwrap_or(0.0) - .max(0.0); + .unwrap_or(missing); } Summary::Max => { v[bin] = front .3 .into_iter() .reduce(|max, v| max.max(v)) - .unwrap_or(0.0) - .max(0.0); + .unwrap_or(missing); } Summary::Mean => { v[bin] = front.3.into_iter().sum::() / bin_size; @@ -860,7 +860,7 @@ fn to_entry_array_zoom>>( bin, bin_start, bin_end, - vec![f64::NAN; (bin_end - bin_start) as usize], + vec![missing; (bin_end - bin_start) as usize], )); } for bin in bin_data.iter() { @@ -883,12 +883,12 @@ fn to_entry_array_zoom>>( for i in &mut data [((overlap_start - *bin_start) as usize)..((overlap_end - *bin_start) as usize)] { + *i = i.max(0.0); match summary { Summary::Mean => *i += mean, Summary::Min => *i = i.min(interval.summary.min_val), Summary::Max => *i = i.max(interval.summary.max_val), } - *i = i.max(0.0); } } } @@ -901,16 +901,14 @@ fn to_entry_array_zoom>>( .3 .into_iter() .reduce(|min, v| min.min(v)) - .unwrap_or(0.0) - .max(0.0); + .unwrap_or(missing); } Summary::Max => { v[bin] = front .3 .into_iter() .reduce(|max, v| max.max(v)) - .unwrap_or(0.0) - .max(0.0); + .unwrap_or(missing); } Summary::Mean => { v[bin] = front.3.into_iter().sum::() / bin_size; @@ -1924,6 +1922,51 @@ mod test { #[test] fn test_to_entry_array_bins() { + fn eq_with_nan_eq(a: f64, b: f64) -> bool { + (a.is_nan() && b.is_nan()) || (a == b) + } + + #[track_caller] + fn assert_equal(found: Vec, expected: Vec) { + let equal = (found.len() == expected.len()) && // zip stops at the shortest + found.iter() + .zip(expected.iter()) + .all(|(a,b)| eq_with_nan_eq(*a, *b)); + if !equal { + panic!("Vecs not equal: expected {:?}, found {:?}", expected, found); + } + } + + let entries = []; + let mut arr = 
Array::from(vec![f64::NAN]); + to_entry_array_bins( + 10, + 20, + entries.into_iter().map(|v| Ok(v)), + crate::Summary::Mean, + 1, + f64::NAN, + arr.view_mut(), + ) + .unwrap(); + let res = arr.to_vec(); + assert_equal(res, [f64::NAN].into_iter().collect::>()); + + let entries = []; + let mut arr = Array::from(vec![f64::NAN]); + to_entry_array_bins( + 10, + 20, + entries.into_iter().map(|v| Ok(v)), + crate::Summary::Mean, + 1, + 0.0, + arr.view_mut(), + ) + .unwrap(); + let res = arr.to_vec(); + assert_equal(res, [0.0].into_iter().collect::>()); + let entries = [BedEntry { start: 10, end: 20, @@ -1936,11 +1979,12 @@ mod test { entries.into_iter().map(|v| Ok(v)), crate::Summary::Mean, 1, + f64::NAN, arr.view_mut(), ) .unwrap(); let res = arr.to_vec(); - assert_eq!(res, [1.0].into_iter().collect::>()); + assert_equal(res, [1.0].into_iter().collect::>()); let entries = [BedEntry { start: 10, @@ -1954,11 +1998,12 @@ mod test { entries.into_iter().map(|v| Ok(v)), crate::Summary::Mean, 2, + f64::NAN, arr.view_mut(), ) .unwrap(); let res = arr.to_vec(); - assert_eq!(res, [1.0, 1.0].into_iter().collect::>()); + assert_equal(res, [1.0, 1.0].into_iter().collect::>()); let entries = [ BedEntry { @@ -1979,11 +2024,12 @@ mod test { entries.clone().into_iter().map(|v| Ok(v)), crate::Summary::Mean, 2, + f64::NAN, arr.view_mut(), ) .unwrap(); let res = arr.to_vec(); - assert_eq!(res, [1.0, 2.0].into_iter().collect::>()); + assert_equal(res, [1.0, 2.0].into_iter().collect::>()); let mut arr = Array::from(vec![f64::NAN]); to_entry_array_bins( 10, @@ -1991,11 +2037,12 @@ mod test { entries.clone().into_iter().map(|v| Ok(v)), crate::Summary::Min, 1, + f64::NAN, arr.view_mut(), ) .unwrap(); let res = arr.to_vec(); - assert_eq!(res, [1.0].into_iter().collect::>()); + assert_equal(res, [1.0].into_iter().collect::>()); let mut arr = Array::from(vec![f64::NAN]); to_entry_array_bins( 10, @@ -2003,11 +2050,12 @@ mod test { entries.clone().into_iter().map(|v| Ok(v)), crate::Summary::Max, 1, + 
f64::NAN, arr.view_mut(), ) .unwrap(); let res = arr.to_vec(); - assert_eq!(res, [2.0].into_iter().collect::>()); + assert_equal(res, [2.0].into_iter().collect::>()); let entries = [ BedEntry { @@ -2033,11 +2081,12 @@ mod test { entries.clone().into_iter().map(|v| Ok(v)), crate::Summary::Mean, 2, + f64::NAN, arr.view_mut(), ) .unwrap(); let res = arr.to_vec(); - assert_eq!(res, [1.0, 3.0].into_iter().collect::>()); + assert_equal(res, [1.0, 3.0].into_iter().collect::>()); let mut arr = Array::from(vec![f64::NAN]); to_entry_array_bins( 10, @@ -2045,11 +2094,12 @@ mod test { entries.clone().into_iter().map(|v| Ok(v)), crate::Summary::Min, 1, + f64::NAN, arr.view_mut(), ) .unwrap(); let res = arr.to_vec(); - assert_eq!(res, [1.0].into_iter().collect::>()); + assert_equal(res, [1.0].into_iter().collect::>()); let mut arr = Array::from(vec![f64::NAN]); to_entry_array_bins( 10, @@ -2057,10 +2107,11 @@ mod test { entries.clone().into_iter().map(|v| Ok(v)), crate::Summary::Max, 1, + f64::NAN, arr.view_mut(), ) .unwrap(); let res = arr.to_vec(); - assert_eq!(res, [3.0].into_iter().collect::>()); + assert_equal(res, [3.0].into_iter().collect::>()); } } From a4166bbf1abc2c5f072c09564868a900e0e54a3d Mon Sep 17 00:00:00 2001 From: Jack Huey <31162821+jackh726@users.noreply.github.com> Date: Sun, 31 Mar 2024 19:26:55 -0400 Subject: [PATCH 10/30] Add info fn --- bigtools/src/bbi/bbiread.rs | 10 +++++++ pybigtools/src/lib.rs | 59 +++++++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+) diff --git a/bigtools/src/bbi/bbiread.rs b/bigtools/src/bbi/bbiread.rs index 4a02fe2..8ef5079 100644 --- a/bigtools/src/bbi/bbiread.rs +++ b/bigtools/src/bbi/bbiread.rs @@ -54,6 +54,16 @@ pub struct BBIHeader { pub(crate) uncompress_buf_size: u32, } +impl BBIHeader { + pub fn is_compressed(&self) -> bool { + self.uncompress_buf_size > 0 + } + + pub fn primary_data_size(&self) -> u64 { + self.full_index_offset - self.full_data_offset + } +} + /// Information on a chromosome in a bbi 
file #[derive(Clone, Debug)] pub struct ChromInfo { diff --git a/pybigtools/src/lib.rs b/pybigtools/src/lib.rs index 88cbb8a..c9919b5 100644 --- a/pybigtools/src/lib.rs +++ b/pybigtools/src/lib.rs @@ -964,6 +964,65 @@ impl BBIRead { } } + fn info(&mut self, py: Python<'_>) -> PyResult { + let (info, summary) = match &mut self.bbi { + BBIReadRaw::BigWigFile(b) => { + let summary = b.get_summary()?; + (b.info(), summary) + } + BBIReadRaw::BigWigRemote(b) => { + let summary = b.get_summary()?; + (b.info(), summary) + } + BBIReadRaw::BigWigFileLike(b) => { + let summary = b.get_summary()?; + (b.info(), summary) + } + BBIReadRaw::BigBedFile(b) => { + let summary = b.get_summary()?; + (b.info(), summary) + } + BBIReadRaw::BigBedRemote(b) => { + let summary = b.get_summary()?; + (b.info(), summary) + } + BBIReadRaw::BigBedFileLike(b) => { + let summary = b.get_summary()?; + (b.info(), summary) + } + }; + let var = (summary.sum_squares + - (summary.sum * summary.sum) / summary.bases_covered as f64) + / (summary.bases_covered as f64 - 1.0); + let summary = [ + ("basesCovered", summary.bases_covered.to_object(py)), + ("sum", summary.sum.to_object(py)), + ( + "mean", + (summary.sum as f64 / summary.bases_covered as f64).to_object(py), + ), + ("min", summary.min_val.to_object(py)), + ("max", summary.max_val.to_object(py)), + ("std", f64::sqrt(var).to_object(py)), + ] + .into_py_dict(py) + .to_object(py); + let info = [ + ("version", info.header.version.to_object(py)), + ("isCompressed", info.header.is_compressed().to_object(py)), + ( + "primaryDataSize", + info.header.primary_data_size().to_object(py), + ), + ("zoomLevels", info.zoom_headers.len().to_object(py)), + ("chromCount", info.chrom_info.len().to_object(py)), + ("summary", summary), + ] + .into_py_dict(py) + .to_object(py); + Ok(info) + } + /// Returns the autosql of this bbi file. /// /// For bigBeds, this comes directly from the autosql stored in the file. 
From 6fc14fdda51dd0a1577d33f19d7e0ee3f6249aad Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Mon, 1 Apr 2024 20:03:45 -0400 Subject: [PATCH 11/30] Make is_bigwig, is_bigbed properties --- pybigtools/src/lib.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pybigtools/src/lib.rs b/pybigtools/src/lib.rs index c9919b5..37c6169 100644 --- a/pybigtools/src/lib.rs +++ b/pybigtools/src/lib.rs @@ -926,6 +926,7 @@ struct BBIRead { #[pymethods] impl BBIRead { + #[getter] fn is_bigwig(&self) -> bool { #[cfg(feature = "remote")] { @@ -945,6 +946,7 @@ impl BBIRead { } } + #[getter] fn is_bigbed(&self) -> bool { #[cfg(feature = "remote")] { From 84a73d5f4f027ed8e0ba4afbe6efc295cb62bae4 Mon Sep 17 00:00:00 2001 From: Jack Huey <31162821+jackh726@users.noreply.github.com> Date: Wed, 3 Apr 2024 23:00:48 -0400 Subject: [PATCH 12/30] Add zoom_records function --- bigtools/src/bbi/bbiread.rs | 31 +++++--- bigtools/src/bbi/bigbedread.rs | 23 ++++++ bigtools/src/bbi/bigwigread.rs | 22 ++++++ pybigtools/src/lib.rs | 134 +++++++++++++++++++++++++++++++-- 4 files changed, 192 insertions(+), 18 deletions(-) diff --git a/bigtools/src/bbi/bbiread.rs b/bigtools/src/bbi/bbiread.rs index 8ef5079..891e513 100644 --- a/bigtools/src/bbi/bbiread.rs +++ b/bigtools/src/bbi/bbiread.rs @@ -1,3 +1,4 @@ +use std::borrow::BorrowMut; use std::collections::hash_map::Entry; use std::collections::{HashMap, VecDeque}; use std::fs::File; @@ -1318,7 +1319,7 @@ pub(crate) fn get_zoom_block_values( chrom: u32, start: u32, end: u32, -) -> Result + Send>, BBIReadError> { +) -> Result, BBIReadError> { let (read, info) = bbifile.reader_and_info(); let data = read.get_block_data(info, &block)?; let mut bytes = BytesMut::with_capacity(data.len()); @@ -1389,30 +1390,34 @@ pub(crate) fn get_zoom_block_values( } *known_offset = block.offset + block.size; - Ok(Box::new(records.into_iter())) + Ok(records.into_iter()) } -pub(crate) struct ZoomIntervalIter<'a, I, B> +pub(crate) struct ZoomIntervalIter where I: 
Iterator + Send, - B: BBIRead, + R: BBIRead, + B: BorrowMut, { - bbifile: &'a mut B, + _r: std::marker::PhantomData, + bbifile: B, known_offset: u64, blocks: I, - vals: Option + Send + 'a>>, + vals: Option>, chrom: u32, start: u32, end: u32, } -impl<'a, I, B> ZoomIntervalIter<'a, I, B> +impl ZoomIntervalIter where I: Iterator + Send, - B: BBIRead, + R: BBIRead, + B: BorrowMut, { - pub fn new(bbifile: &'a mut B, blocks: I, chrom: u32, start: u32, end: u32) -> Self { + pub fn new(bbifile: B, blocks: I, chrom: u32, start: u32, end: u32) -> Self { ZoomIntervalIter { + _r: std::marker::PhantomData, bbifile, known_offset: 0, blocks, @@ -1424,10 +1429,11 @@ where } } -impl<'a, I, B> Iterator for ZoomIntervalIter<'a, I, B> +impl Iterator for ZoomIntervalIter where I: Iterator + Send, - B: BBIRead, + R: BBIRead, + B: BorrowMut, { type Item = Result; @@ -1444,8 +1450,9 @@ where }, None => { let current_block = self.blocks.next()?; + let file = self.bbifile.borrow_mut(); match get_zoom_block_values( - self.bbifile, + file, current_block, &mut self.known_offset, self.chrom, diff --git a/bigtools/src/bbi/bigbedread.rs b/bigtools/src/bbi/bigbedread.rs index c99653f..99323af 100644 --- a/bigtools/src/bbi/bigbedread.rs +++ b/bigtools/src/bbi/bigbedread.rs @@ -328,6 +328,29 @@ impl BigBedRead { let chrom = self.info.chrom_id(chrom_name)?; + let blocks = search_cir_tree(&self.info, &mut self.read, cir_tree, chrom_name, start, end)?; + Ok(ZoomIntervalIter::< + std::vec::IntoIter, + BigBedRead, + &mut BigBedRead, + >::new(self, blocks.into_iter(), chrom, start, end)) + } + + /// For a given chromosome, start, and end, returns an `Iterator` of the + /// intersecting `ZoomRecord`s. 
+ pub fn get_zoom_interval_move<'a>( + mut self, + chrom_name: &str, + start: u32, + end: u32, + reduction_level: u32, + ) -> Result>, ZoomIntervalError> { + let cir_tree = self + .zoom_cir_tree(reduction_level) + .map_err(|_| ZoomIntervalError::ReductionLevelNotFound)?; + + let chrom = self.info.chrom_id(chrom_name)?; + let blocks = search_cir_tree(&self.info, &mut self.read, cir_tree, chrom_name, start, end)?; Ok(ZoomIntervalIter::new( self, diff --git a/bigtools/src/bbi/bigwigread.rs b/bigtools/src/bbi/bigwigread.rs index b7f62b7..6047bfd 100644 --- a/bigtools/src/bbi/bigwigread.rs +++ b/bigtools/src/bbi/bigwigread.rs @@ -348,6 +348,28 @@ where let blocks = search_cir_tree(&self.info, &mut self.read, cir_tree, chrom_name, start, end)?; + Ok(ZoomIntervalIter::< + std::vec::IntoIter, + BigWigRead, + &mut BigWigRead, + >::new(self, blocks.into_iter(), chrom, start, end)) + } + + /// For a given chromosome, start, and end, returns an `Iterator` of the + /// intersecting `ZoomRecord`s. + pub fn get_zoom_interval_move<'a>( + mut self, + chrom_name: &str, + start: u32, + end: u32, + reduction_level: u32, + ) -> Result>, ZoomIntervalError> { + let cir_tree = self.zoom_cir_tree(reduction_level)?; + + let chrom = self.info.chrom_id(chrom_name)?; + + let blocks = search_cir_tree(&self.info, &mut self.read, cir_tree, chrom_name, start, end)?; + Ok(ZoomIntervalIter::new( self, blocks.into_iter(), diff --git a/pybigtools/src/lib.rs b/pybigtools/src/lib.rs index 37c6169..7f99fe8 100644 --- a/pybigtools/src/lib.rs +++ b/pybigtools/src/lib.rs @@ -1025,6 +1025,18 @@ impl BBIRead { Ok(info) } + fn zooms(&self) -> Vec { + let zooms = match &self.bbi { + BBIReadRaw::BigWigFile(b) => &b.info().zoom_headers, + BBIReadRaw::BigWigRemote(b) => &b.info().zoom_headers, + BBIReadRaw::BigWigFileLike(b) => &b.info().zoom_headers, + BBIReadRaw::BigBedFile(b) => &b.info().zoom_headers, + BBIReadRaw::BigBedRemote(b) => &b.info().zoom_headers, + BBIReadRaw::BigBedFileLike(b) => 
&b.info().zoom_headers, + }; + zooms.iter().map(|z| z.reduction_level).collect() + } + /// Returns the autosql of this bbi file. /// /// For bigBeds, this comes directly from the autosql stored in the file. @@ -1118,7 +1130,7 @@ impl BBIRead { BBIReadRaw::BigWigFile(b) => { let b = b.reopen()?; Ok(BigWigIntervalIterator { - iter: Box::new(b.get_interval_move(&chrom, start, end).unwrap()), + iter: Box::new(b.get_interval_move(&chrom, start, end).convert_err()?), } .into_py(py)) } @@ -1126,21 +1138,21 @@ impl BBIRead { BBIReadRaw::BigWigRemote(b) => { let b = b.reopen()?; Ok(BigWigIntervalIterator { - iter: Box::new(b.get_interval_move(&chrom, start, end).unwrap()), + iter: Box::new(b.get_interval_move(&chrom, start, end).convert_err()?), } .into_py(py)) } BBIReadRaw::BigWigFileLike(b) => { let b = b.reopen()?; Ok(BigWigIntervalIterator { - iter: Box::new(b.get_interval_move(&chrom, start, end).unwrap()), + iter: Box::new(b.get_interval_move(&chrom, start, end).convert_err()?), } .into_py(py)) } BBIReadRaw::BigBedFile(b) => { let b = b.reopen()?; Ok(BigBedEntriesIterator { - iter: Box::new(b.get_interval_move(&chrom, start, end).unwrap()), + iter: Box::new(b.get_interval_move(&chrom, start, end).convert_err()?), } .into_py(py)) } @@ -1148,20 +1160,97 @@ impl BBIRead { BBIReadRaw::BigBedRemote(b) => { let b = b.reopen()?; Ok(BigBedEntriesIterator { - iter: Box::new(b.get_interval_move(&chrom, start, end).unwrap()), + iter: Box::new(b.get_interval_move(&chrom, start, end).convert_err()?), } .into_py(py)) } BBIReadRaw::BigBedFileLike(b) => { let b = b.reopen()?; Ok(BigBedEntriesIterator { - iter: Box::new(b.get_interval_move(&chrom, start, end).unwrap()), + iter: Box::new(b.get_interval_move(&chrom, start, end).convert_err()?), } .into_py(py)) } } } + /// Returns the zoom records of a given range on a chromosome for a given zoom level. + /// + /// The result is an iterator of tuples. These tuples are in the format + /// (start: int, end: int, summary: dict). 
+ /// + /// The chrom argument is the name of the chromosome. + /// The start and end arguments denote the range to get values for. + /// If end is not provided, it defaults to the length of the chromosome. + /// If start is not provided, it defaults to the beginning of the chromosome. + fn zoom_records( + &mut self, + reduction_level: u32, + chrom: String, + start: Option, + end: Option, + ) -> PyResult { + let (start, end) = start_end(&self.bbi, &chrom, start, end)?; + match &self.bbi { + BBIReadRaw::BigWigFile(b) => { + let b = b.reopen()?; + let iter = b + .get_zoom_interval_move(&chrom, start, end, reduction_level) + .convert_err()?; + Ok(ZoomIntervalIterator { + iter: Box::new(iter), + }) + } + #[cfg(feature = "remote")] + BBIReadRaw::BigWigRemote(b) => { + let b = b.reopen()?; + let iter = b + .get_zoom_interval_move(&chrom, start, end, reduction_level) + .convert_err()?; + Ok(ZoomIntervalIterator { + iter: Box::new(iter), + }) + } + BBIReadRaw::BigWigFileLike(b) => { + let b = b.reopen()?; + let iter = b + .get_zoom_interval_move(&chrom, start, end, reduction_level) + .convert_err()?; + Ok(ZoomIntervalIterator { + iter: Box::new(iter), + }) + } + BBIReadRaw::BigBedFile(b) => { + let b = b.reopen()?; + let iter = b + .get_zoom_interval_move(&chrom, start, end, reduction_level) + .convert_err()?; + Ok(ZoomIntervalIterator { + iter: Box::new(iter), + }) + } + #[cfg(feature = "remote")] + BBIReadRaw::BigBedRemote(b) => { + let b = b.reopen()?; + let iter = b + .get_zoom_interval_move(&chrom, start, end, reduction_level) + .convert_err()?; + Ok(ZoomIntervalIterator { + iter: Box::new(iter), + }) + } + BBIReadRaw::BigBedFileLike(b) => { + let b = b.reopen()?; + let iter = b + .get_zoom_interval_move(&chrom, start, end, reduction_level) + .convert_err()?; + Ok(ZoomIntervalIterator { + iter: Box::new(iter), + }) + } + } + } + /// Returns the values of a given range on a chromosome. /// /// For bigWigs, the result is an array of length (end - start). 
@@ -1265,6 +1354,39 @@ impl BBIRead { } } +#[pyclass(module = "pybigtools")] +struct ZoomIntervalIterator { + iter: Box> + Send>, +} + +#[pymethods] +impl ZoomIntervalIterator { + fn __iter__(slf: PyRefMut) -> PyResult> { + Ok(slf.into()) + } + + fn __next__(mut slf: PyRefMut) -> PyResult> { + slf.iter + .next() + .transpose() + .map(|o| { + o.map(|v| { + let summary = [ + ("total_items", v.summary.total_items.to_object(slf.py())), + ("bases_covered", v.summary.bases_covered.to_object(slf.py())), + ("min_val", v.summary.min_val.to_object(slf.py())), + ("max_val", v.summary.max_val.to_object(slf.py())), + ("sum", v.summary.sum.to_object(slf.py())), + ("sum_squares", v.summary.sum_squares.to_object(slf.py())), + ] + .to_object(slf.py()); + (v.start, v.end, summary) + }) + }) + .convert_err() + } +} + /// This class is an iterator for `Values` from a bigWig. /// It returns only values that exist in the bigWig, skipping /// any missing intervals. From bf528f0d13801155063036af34838ab01092018b Mon Sep 17 00:00:00 2001 From: Jack Huey <31162821+jackh726@users.noreply.github.com> Date: Sun, 7 Apr 2024 02:53:49 -0400 Subject: [PATCH 13/30] Add close method, fix text signature of values, implement gc and context manager methods, add api test, and fix bigtools returning intervals starting at the end base --- bigtools/src/bbi/bbiread.rs | 8 ++- bigtools/src/bbi/bigbedread.rs | 5 ++ bigtools/src/bbi/bigwigread.rs | 6 +- pybigtools/src/file_like.rs | 2 +- pybigtools/src/lib.rs | 106 +++++++++++++++++++++++-------- pybigtools/tests/test_api.py | 111 +++++++++++++++++++++++++++++++++ 6 files changed, 207 insertions(+), 31 deletions(-) create mode 100644 pybigtools/tests/test_api.py diff --git a/bigtools/src/bbi/bbiread.rs b/bigtools/src/bbi/bbiread.rs index 891e513..0a44f7d 100644 --- a/bigtools/src/bbi/bbiread.rs +++ b/bigtools/src/bbi/bbiread.rs @@ -557,12 +557,18 @@ impl BBIFileRead for S { } } -pub struct CachedBBIFileRead { +pub struct CachedBBIFileRead { read: S, 
cir_tree_node_map: HashMap, Vec>>, block_data: HashMap>, } +impl CachedBBIFileRead { + pub fn inner_read(&self) -> &S { + &self.read + } +} + impl CachedBBIFileRead { pub fn new(read: S) -> Self { CachedBBIFileRead { diff --git a/bigtools/src/bbi/bigbedread.rs b/bigtools/src/bbi/bigbedread.rs index 99323af..bf8fca4 100644 --- a/bigtools/src/bbi/bigbedread.rs +++ b/bigtools/src/bbi/bigbedread.rs @@ -214,6 +214,11 @@ impl BigBedRead { }) } + /// Gets a reference to the inner `R` type, in order to access any info + pub fn inner_read(&self) -> &R { + &self.read + } + /// Returns the summary data from bigBed /// /// Note: For version 1 of bigBeds, there is no total summary. In that diff --git a/bigtools/src/bbi/bigwigread.rs b/bigtools/src/bbi/bigwigread.rs index 6047bfd..b5b7bb1 100644 --- a/bigtools/src/bbi/bigwigread.rs +++ b/bigtools/src/bbi/bigwigread.rs @@ -527,7 +527,7 @@ fn get_block_values( end: chrom_end, value, }; - if value.end >= start && value.start <= end { + if value.end > start && value.start < end { value.start = value.start.max(start); value.end = value.end.min(end); values.push(value) @@ -555,7 +555,7 @@ fn get_block_values( end: chrom_end, value, }; - if value.end >= start && value.start <= end { + if value.end > start && value.start < end { value.start = value.start.max(start); value.end = value.end.min(end); values.push(value) @@ -584,7 +584,7 @@ fn get_block_values( end: chrom_end, value, }; - if value.end >= start && value.start <= end { + if value.end > start && value.start < end { value.start = value.start.max(start); value.end = value.end.min(end); values.push(value) diff --git a/pybigtools/src/file_like.rs b/pybigtools/src/file_like.rs index 51873db..77a2c75 100644 --- a/pybigtools/src/file_like.rs +++ b/pybigtools/src/file_like.rs @@ -8,7 +8,7 @@ use pyo3::{ /// traits, calling into python io methods. 
#[derive(Clone)] pub struct PyFileLikeObject { - inner: PyObject, + pub(crate) inner: PyObject, } impl PyFileLikeObject { diff --git a/pybigtools/src/lib.rs b/pybigtools/src/lib.rs index 7f99fe8..af21c93 100644 --- a/pybigtools/src/lib.rs +++ b/pybigtools/src/lib.rs @@ -26,9 +26,9 @@ use file_like::PyFileLikeObject; use numpy::ndarray::ArrayViewMut; use numpy::PyArray1; use pyo3::exceptions::{self, PyTypeError}; -use pyo3::prelude::*; use pyo3::types::{IntoPyDict, PyAny, PyDict, PyFloat, PyInt, PyIterator, PyString, PyTuple}; -use pyo3::wrap_pyfunction; +use pyo3::{create_exception, wrap_pyfunction}; +use pyo3::{prelude::*, PyTraverseError, PyVisit}; use tokio::runtime; use url::Url; @@ -37,13 +37,16 @@ mod file_like; type ValueTuple = (u32, u32, f32); +create_exception!(pybigtools, BBIFileClosed, exceptions::PyException); + fn start_end( bbi: &BBIReadRaw, chrom_name: &str, - start: Option, - end: Option, + start: Option, + end: Option, ) -> PyResult<(u32, u32)> { let chroms = match &bbi { + BBIReadRaw::Closed => return Err(BBIFileClosed::new_err("File is closed.")), BBIReadRaw::BigWigFile(b) => b.chroms(), #[cfg(feature = "remote")] BBIReadRaw::BigWigRemote(b) => b.chroms(), @@ -63,7 +66,10 @@ fn start_end( } Some(c) => c.length, }; - return Ok((start.unwrap_or(0), end.unwrap_or(length))); + return Ok(( + start.map(|v| v.max(0) as u32).unwrap_or(0), + end.map(|v| (v.max(0) as u32).min(length)).unwrap_or(length), + )); } fn bigwig_start_end_length( @@ -112,6 +118,7 @@ impl Reopen for PyFileLikeObject { } enum BBIReadRaw { + Closed, BigWigFile(BigWigReadRaw>), #[cfg(feature = "remote")] BigWigRemote(BigWigReadRaw>), @@ -968,6 +975,7 @@ impl BBIRead { fn info(&mut self, py: Python<'_>) -> PyResult { let (info, summary) = match &mut self.bbi { + BBIReadRaw::Closed => return Err(BBIFileClosed::new_err("File is closed.")), BBIReadRaw::BigWigFile(b) => { let summary = b.get_summary()?; (b.info(), summary) @@ -1025,8 +1033,9 @@ impl BBIRead { Ok(info) } - fn zooms(&self) 
-> Vec { + fn zooms(&self) -> PyResult> { let zooms = match &self.bbi { + BBIReadRaw::Closed => return Err(BBIFileClosed::new_err("File is closed.")), BBIReadRaw::BigWigFile(b) => &b.info().zoom_headers, BBIReadRaw::BigWigRemote(b) => &b.info().zoom_headers, BBIReadRaw::BigWigFileLike(b) => &b.info().zoom_headers, @@ -1034,7 +1043,7 @@ impl BBIRead { BBIReadRaw::BigBedRemote(b) => &b.info().zoom_headers, BBIReadRaw::BigBedFileLike(b) => &b.info().zoom_headers, }; - zooms.iter().map(|z| z.reduction_level).collect() + Ok(zooms.iter().map(|z| z.reduction_level).collect()) } /// Returns the autosql of this bbi file. @@ -1051,19 +1060,19 @@ impl BBIRead { /// "fields": [(, , ), ...], /// } /// ``` - fn sql(&mut self, py: Python, parse: Option) -> PyResult { - let parse = parse.unwrap_or(false); - pub const BEDGRAPH: &str = r#" - table bedGraph - "bedGraph file" - ( - string chrom; "Reference sequence chromosome or scaffold" - uint chromStart; "Start position in chromosome" - uint chromEnd; "End position in chromosome" - float value; "Value for a given interval" - ) + #[pyo3(signature = (parse = false))] + fn sql(&mut self, py: Python, parse: bool) -> PyResult { + pub const BEDGRAPH: &str = r#"table bedGraph +"bedGraph file" +( + string chrom; "Reference sequence chromosome or scaffold" + uint chromStart; "Start position in chromosome" + uint chromEnd; "End position in chromosome" + float value; "Value for a given interval" +)\ "#; let schema = match &mut self.bbi { + BBIReadRaw::Closed => return Err(BBIFileClosed::new_err("File is closed.")), BBIReadRaw::BigWigFile(_) | BBIReadRaw::BigWigRemote(_) | BBIReadRaw::BigWigFileLike(_) => BEDGRAPH.to_string(), @@ -1122,11 +1131,12 @@ impl BBIRead { &mut self, py: Python<'_>, chrom: String, - start: Option, - end: Option, + start: Option, + end: Option, ) -> PyResult { let (start, end) = start_end(&self.bbi, &chrom, start, end)?; match &self.bbi { + BBIReadRaw::Closed => return Err(BBIFileClosed::new_err("File is closed.")), 
BBIReadRaw::BigWigFile(b) => { let b = b.reopen()?; Ok(BigWigIntervalIterator { @@ -1187,11 +1197,12 @@ impl BBIRead { &mut self, reduction_level: u32, chrom: String, - start: Option, - end: Option, + start: Option, + end: Option, ) -> PyResult { let (start, end) = start_end(&self.bbi, &chrom, start, end)?; match &self.bbi { + BBIReadRaw::Closed => return Err(BBIFileClosed::new_err("File is closed.")), BBIReadRaw::BigWigFile(b) => { let b = b.reopen()?; let iter = b @@ -1262,10 +1273,14 @@ impl BBIRead { /// The chrom argument is the name of the chromosome. /// The start and end arguments denote the range to get values for. /// If end is not provided, it defaults to the length of the chromosome. - /// If start is not provided, it defaults to the beginning of the chromosome. + /// If start is not provided, it defaults to the beginning of the chromosome. + /// The default oob value is `numpy.nan`. /// /// This returns a numpy array. - #[pyo3(signature = (chrom, start, end, bins=None, summary="mean".to_string(), exact=false, missing=0.0, oob=f64::NAN, arr=None))] + #[pyo3( + signature = (chrom, start, end, bins=None, summary="mean".to_string(), exact=false, missing=0.0, oob=f64::NAN, arr=None), + text_signature = r#"(chrom, start, end, bins=None, summary="mean", exact=False, missing=0.0, oob=..., arr=None)"#, + )] fn values( &mut self, py: Python<'_>, @@ -1290,6 +1305,7 @@ impl BBIRead { } }; match &mut self.bbi { + BBIReadRaw::Closed => return Err(BBIFileClosed::new_err("File is closed.")), BBIReadRaw::BigWigFile(b) => intervals_to_array( py, b, &chrom, start, end, bins, summary, exact, missing, oob, arr, ), @@ -1319,7 +1335,7 @@ impl BBIRead { /// If it is None, then all chroms will be returned. /// If it is a String, then the length of that chromosome will be returned. /// If the chromosome doesn't exist, nothing will be returned. 
- fn chroms(&mut self, py: Python, chrom: Option) -> Option { + fn chroms(&mut self, py: Python, chrom: Option) -> PyResult> { fn get_chrom_obj( b: &B, py: Python, @@ -1341,7 +1357,8 @@ impl BBIRead { ), } } - match &self.bbi { + Ok(match &self.bbi { + BBIReadRaw::Closed => return Err(BBIFileClosed::new_err("File is closed.")), BBIReadRaw::BigWigFile(b) => get_chrom_obj(b, py, chrom), #[cfg(feature = "remote")] BBIReadRaw::BigWigRemote(b) => get_chrom_obj(b, py, chrom), @@ -1350,7 +1367,41 @@ impl BBIRead { #[cfg(feature = "remote")] BBIReadRaw::BigBedRemote(b) => get_chrom_obj(b, py, chrom), BBIReadRaw::BigBedFileLike(b) => get_chrom_obj(b, py, chrom), + }) + } + + fn close(&mut self) { + self.bbi = BBIReadRaw::Closed; + } + + fn __enter__(slf: Py) -> Py { + slf + } + + fn __exit__( + slf: Py, + py: Python<'_>, + _exc_type: PyObject, + _exc_value: PyObject, + _exc_traceback: PyObject, + ) -> Py { + slf.borrow_mut(py).bbi = BBIReadRaw::Closed; + slf + } + + fn __traverse__(&self, visit: PyVisit<'_>) -> Result<(), PyTraverseError> { + match &self.bbi { + BBIReadRaw::Closed | BBIReadRaw::BigWigFile(_) | BBIReadRaw::BigBedFile(_) => {} + #[cfg(feature = "remote")] + BBIReadRaw::BigWigRemote(_) | BBIReadRaw::BigBedRemote(_) => {} + BBIReadRaw::BigWigFileLike(b) => visit.call(&b.inner_read().inner_read().inner)?, + BBIReadRaw::BigBedFileLike(b) => visit.call(&b.inner_read().inner_read().inner)?, } + Ok(()) + } + + fn __clear__(&mut self) { + self.close() } } @@ -1379,6 +1430,7 @@ impl ZoomIntervalIterator { ("sum", v.summary.sum.to_object(slf.py())), ("sum_squares", v.summary.sum_squares.to_object(slf.py())), ] + .into_py_dict(slf.py()) .to_object(slf.py()); (v.start, v.end, summary) }) @@ -2083,6 +2135,8 @@ fn bigWigAverageOverBed( #[pymodule] fn pybigtools(_py: Python, m: &PyModule) -> PyResult<()> { m.add("__version__", env!("CARGO_PKG_VERSION"))?; + m.add("BBIFileClosed", m.py().get_type::())?; + m.add_wrapped(wrap_pyfunction!(open))?; 
m.add_wrapped(wrap_pyfunction!(bigWigAverageOverBed))?; diff --git a/pybigtools/tests/test_api.py b/pybigtools/tests/test_api.py new file mode 100644 index 0000000..2cf004a --- /dev/null +++ b/pybigtools/tests/test_api.py @@ -0,0 +1,111 @@ +import pybigtools +import math +import numpy as np + +def test_api(): + b = pybigtools.open('../bigtools/resources/test/valid.bigWig', 'r') + + # Check type of file + assert b.is_bigbed == False + assert b.is_bigwig == True + + # No args => dict + assert b.chroms() == {'chr17': 83257441} + # Arg with chrom name => length + assert b.chroms('chr17') == 83257441 + # Missing chrom => None + assert b.chroms('chr11') == None + + # Get a list of zooms + assert b.zooms() == [10, 40, 160, 640, 2560, 10240, 40960, 163840, 655360, 2621440] + + # Even bigwigs have sql (a sql representing bedGraph) + assert "bedGraph" in b.sql() + # We can parse the sql + assert b.sql(True)['name'] == 'bedGraph' + + # (chrom, None, None) => all records on chrom + assert len(list(b.records('chr17'))) == 100000 + # (chrom, start, None) => all records from (start, ) + assert len(list(b.records('chr17', 100000))) == 91360 + # (chrom, start, end) => all records from (start, end) + assert len(list(b.records('chr17', 100000, 110000))) == 1515 + # Out of bounds start/end are truncated + assert len(list(b.records('chr17', -1000, 100000))) == 8641 + assert len(list(b.records('chr17', -1000, -500))) == 0 + assert len(list(b.records('chr17', 0, 84000000))) == 100000 + assert next(b.records('chr17')) == (59898, 59900, 0.06791999936103821) + # Unknown chrom => exception + try: + b.zoom_records('chr11') + except: + pass + else: + assert False + + # (chrom, None, None) => all records on chrom + assert len(list(b.zoom_records(10, 'chr17'))) == 13811 + # (chrom, start, None) => all records from (start, ) + assert len(list(b.zoom_records(10, 'chr17', 100000))) == 10872 + # (chrom, start, end) => all records from (start, end) + assert len(list(b.zoom_records(10, 'chr17', 
100000, 110000))) == 766 + # Out of bounds start/end are truncated + assert len(list(b.zoom_records(10, 'chr17', -1000, 100000))) == 2940 + assert len(list(b.zoom_records(10, 'chr17', -1000, -500))) == 0 + assert len(list(b.zoom_records(10, 'chr17', 0, 84000000))) == 13811 + assert next(b.zoom_records(10, 'chr17', 0, 100000)) == (59898, 59908, {'total_items': 0, 'bases_covered': 10, 'min_val': 0.06791999936103821, 'max_val': 0.16627000272274017, 'sum': 1.4660000801086426, 'sum_squares': 0.2303919643163681}) + assert next(b.zoom_records(160, 'chr17', 0, 100000)) == (59898, 60058, {'total_items': 0, 'bases_covered': 160, 'min_val': 0.06791999936103821, 'max_val': 0.8688300251960754, 'sum': 101.3516616821289, 'sum_squares': 80.17473602294922}) + # Unknown zoom => exception + try: + b.zoom_records(0, 'chr17') + except: + pass + else: + assert False + # Unknown chrom => exception + try: + b.zoom_records(10, 'chr11') + except: + pass + else: + assert False + + assert len(list(b.values('chr17', 100000, 110000))) == 10000 + assert len(list(b.values('chr17', 100000, 110000, 10))) == 10 + assert b.values('chr17', 100000, 110000, 10)[0] == 0.37435242314338685 + assert b.values('chr17', 100000, 110000, 10, 'max')[0] == 1.1978399753570557 + assert b.values('chr17', 100000, 110000, 10, 'min')[0] == 0.05403999984264374 + assert b.values('chr17', 100000, 110000, 10, 'mean', True)[0] == 0.37885534041374924 + assert list(b.values('chr17', 59890, 59900, 10, 'mean', True)) == [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.06791999936103821, 0.06791999936103821] + assert list(b.values('chr17', 59890, 59900, 10, 'mean', True, -1.0)) == [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 0.06791999936103821, 0.06791999936103821] + assert math.isnan(b.values('chr17', -10, 10, 20, 'mean', True, 0.0)[0]) + assert not math.isnan(b.values('chr17', -10, 10, 20, 'mean', True, 0.0)[19]) + assert b.values('chr17', -10, 10, 20, 'mean', True, 0.0, 0.0)[0] == 0.0 + arr = np.zeros(20) + ret_arr = 
b.values('chr17', -10, 10, 20, 'mean', True, 0.0, np.nan, arr) + # The returned array is the same as the one passed, so both show the same values + assert math.isnan(arr[0]) + assert arr[19] == 0.0 + assert math.isnan(ret_arr[0]) + assert ret_arr[19] == 0.0 + + b.close() + # Closing means file is not usable + try: + b.chroms() + except: + pass + else: + assert False + + b = pybigtools.open('../bigtools/resources/test/valid.bigWig', 'r') + # Files are closed when exiting a context manager + with b: + pass + try: + b.chroms() + except: + pass + else: + assert False + From 075cc464d3c011a232bd7801268d97e64e35f9e1 Mon Sep 17 00:00:00 2001 From: Jack Huey <31162821+jackh726@users.noreply.github.com> Date: Mon, 8 Apr 2024 10:07:48 -0400 Subject: [PATCH 14/30] Move bigWigAverageOverBed to BBIFileRead --- pybigtools/src/lib.rs | 236 ++++++++++++++++++------------------------ 1 file changed, 101 insertions(+), 135 deletions(-) diff --git a/pybigtools/src/lib.rs b/pybigtools/src/lib.rs index af21c93..99a5313 100644 --- a/pybigtools/src/lib.rs +++ b/pybigtools/src/lib.rs @@ -980,6 +980,7 @@ impl BBIRead { let summary = b.get_summary()?; (b.info(), summary) } + #[cfg(feature = "remote")] BBIReadRaw::BigWigRemote(b) => { let summary = b.get_summary()?; (b.info(), summary) @@ -992,6 +993,7 @@ impl BBIRead { let summary = b.get_summary()?; (b.info(), summary) } + #[cfg(feature = "remote")] BBIReadRaw::BigBedRemote(b) => { let summary = b.get_summary()?; (b.info(), summary) @@ -1037,9 +1039,11 @@ impl BBIRead { let zooms = match &self.bbi { BBIReadRaw::Closed => return Err(BBIFileClosed::new_err("File is closed.")), BBIReadRaw::BigWigFile(b) => &b.info().zoom_headers, + #[cfg(feature = "remote")] BBIReadRaw::BigWigRemote(b) => &b.info().zoom_headers, BBIReadRaw::BigWigFileLike(b) => &b.info().zoom_headers, BBIReadRaw::BigBedFile(b) => &b.info().zoom_headers, + #[cfg(feature = "remote")] BBIReadRaw::BigBedRemote(b) => &b.info().zoom_headers, BBIReadRaw::BigBedFileLike(b) => 
&b.info().zoom_headers, }; @@ -1073,10 +1077,11 @@ impl BBIRead { "#; let schema = match &mut self.bbi { BBIReadRaw::Closed => return Err(BBIFileClosed::new_err("File is closed.")), - BBIReadRaw::BigWigFile(_) - | BBIReadRaw::BigWigRemote(_) - | BBIReadRaw::BigWigFileLike(_) => BEDGRAPH.to_string(), + BBIReadRaw::BigWigFile(_) | BBIReadRaw::BigWigFileLike(_) => BEDGRAPH.to_string(), + #[cfg(feature = "remote")] + BBIReadRaw::BigWigRemote(_) => BEDGRAPH.to_string(), BBIReadRaw::BigBedFile(b) => b.autosql().convert_err()?, + #[cfg(feature = "remote")] BBIReadRaw::BigBedRemote(b) => b.autosql().convert_err()?, BBIReadRaw::BigBedFileLike(b) => b.autosql().convert_err()?, }; @@ -1370,6 +1375,85 @@ impl BBIRead { }) } + /// Gets the average values from a bigWig over the entries of a bed file. + /// Raises an exception if the current file is a bigBed. + /// + /// Parameters: + /// bed (str): The path to the bed. + /// names (None, bool, or int): + /// If `None`, then each return value will be a single `float`, + /// the average value over an interval in the bed file. + /// If `True`, then each return value will be a tuple of the value of column 4 + /// and the average value over the interval with that name in the bed file. + /// If `False`, then each return value will be a tuple of the interval in the format + /// `{chrom}:{start}-{end}` and the average value over that interval. + /// If `0`, then each return value will match as if `False` was passed. + /// If a `1+`, then each return value will be a tuple of the value of column of this + /// parameter (1-based) and the average value over the interval. + /// + /// Returns: + /// This returns a generator of values. (Therefore, to save to a list, do `list(bigWigAverageOverBed(...))`) + /// If no name is specified (see the `names` parameter above), then returns a generator of `float`s. 
+ /// If a name column is specified (see above), then returns a generator of tuples `({name}, {average})` + fn average_over_bed( + &mut self, + py: Python, + bed: String, + names: Option, + ) -> PyResult { + let (name, usename) = { + match names { + Some(names) => match names.extract::(py) { + Ok(true) => (Name::Column(3), true), + Ok(false) => (Name::None, true), + Err(_) => match names.extract::(py) { + Ok(col) => match col { + 0 => (Name::None, true), + 1.. => (Name::Column((col - 1) as usize), true), + _ => { + return Err(PyErr::new::( + "Invalid names argument. Must be >= 0.", + )); + } + }, + Err(_) => { + return Err(PyErr::new::("Invalid names argument. Should be either `None`, a `bool`, or an `int`")); + } + }, + }, + None => (Name::None, false), + } + }; + let bedin = BufReader::new(File::open(bed)?); + + let res = match &mut self.bbi { + BBIReadRaw::Closed => return Err(BBIFileClosed::new_err("File is closed.")), + BBIReadRaw::BigWigFile(b) => { + let b = b.reopen()?; + let iter = Box::new(bigwig_average_over_bed(bedin, b, name)); + BigWigAverageOverBedEntriesIterator { iter, usename }.into_py(py) + } + #[cfg(feature = "remote")] + BBIReadRaw::BigWigRemote(b) => { + let b = b.reopen()?; + let iter = Box::new(bigwig_average_over_bed(bedin, b, name)); + BigWigAverageOverBedEntriesIterator { iter, usename }.into_py(py) + } + BBIReadRaw::BigWigFileLike(b) => { + let b = b.reopen()?; + let iter = Box::new(bigwig_average_over_bed(bedin, b, name)); + BigWigAverageOverBedEntriesIterator { iter, usename }.into_py(py) + } + BBIReadRaw::BigBedFile(_) | BBIReadRaw::BigBedFileLike(_) => { + return Err(BBIFileClosed::new_err("Not a bigWig.")) + } + #[cfg(feature = "remote")] + BBIReadRaw::BigBedRemote(_) => return Err(BBIFileClosed::new_err("Not a bigWig.")), + }; + + Ok(res) + } + fn close(&mut self) { self.bbi = BBIReadRaw::Closed; } @@ -1798,19 +1882,22 @@ impl BigWigAverageOverBedEntriesIterator { fn __next__( mut slf: PyRefMut, ) -> PyResult> { - slf.iter + let v 
= slf + .iter .next() .transpose() - .map(|o| { - o.map(|v| { - if slf.usename { - BigWigAverageOverBedEntriesIteratorRet::WithName((v.name, v.mean)) - } else { - BigWigAverageOverBedEntriesIteratorRet::Single(v.mean) - } - }) - }) - .map_err(|e| PyErr::new::(format!("{}", e))) + .map_err(|e| PyErr::new::(format!("{}", e)))?; + + let Some(v) = v else { + return Ok(None); + }; + + let item = if slf.usename { + BigWigAverageOverBedEntriesIteratorRet::WithName((v.name, v.mean)) + } else { + BigWigAverageOverBedEntriesIteratorRet::Single(v.mean) + }; + Ok(Some(item)) } } @@ -2011,126 +2098,6 @@ fn open_path_or_url( Ok(res) } -/// Gets the average values from a bigWig over the entries of a bed file. -/// -/// Parameters: -/// bigWig (str): The path to the bigWig. -/// bed (str): The path to the bed. -/// names (None, bool, or int): -/// If `None`, then each return value will be a single `float`, -/// the average value over an interval in the bed file. -/// If `True`, then each return value will be a tuple of the value of column 4 -/// and the average value over the interval with that name in the bed file. -/// If `False`, then each return value will be a tuple of the interval in the format -/// `{chrom}:{start}-{end}` and the average value over that interval. -/// If `0`, then each return value will match as if `False` was passed. -/// If a `1+`, then each return value will be a tuple of the value of column of this -/// parameter (1-based) and the average value over the interval. -/// -/// Returns: -/// This returns a generator of values. (Therefore, to save to a list, do `list(bigWigAverageOverBed(...))`) -/// If no name is specified (see the `names` parameter above), then returns a generator of `float`s. 
-/// If a name column is specified (see above), then returns a generator of tuples `({name}, {average})` -#[pyfunction] -fn bigWigAverageOverBed( - py: Python, - bigwig: String, - bed: String, - names: Option, -) -> PyResult { - let extension = match &Path::new(&bigwig).extension().map(|e| e.to_string_lossy()) { - Some(e) => e.to_string(), - None => { - return Err(PyErr::new::(format!( - "Invalid file type. Must be a bigWig (.bigWig, .bw)." - ))); - } - }; - let isfile = std::path::Path::new(&bigwig).exists(); - if !isfile { - match Url::parse(&bigwig) { - Ok(_) => {} - Err(_) => { - return Err(PyErr::new::(format!( - "Invalid file path. The file does not exists and it is not a url." - ))) - } - } - } - let (name, usename) = { - match names { - Some(names) => match names.extract::(py) { - Ok(b) => { - if b { - (Name::Column(3), true) - } else { - (Name::None, true) - } - } - Err(_) => match names.extract::(py) { - Ok(col) => match col { - 0 => (Name::None, true), - 1.. => (Name::Column((col - 1) as usize), true), - _ => { - return Err(PyErr::new::( - "Invalid names argument. Must be >= 0.", - )); - } - }, - Err(_) => { - return Err(PyErr::new::("Invalid names argument. Should be either `None`, a `bool`, or an `int`")); - } - }, - }, - None => (Name::None, false), - } - }; - let res = match extension.as_ref() { - "bw" | "bigWig" | "bigwig" => { - if isfile { - let read = BigWigReadRaw::open_file(&bigwig) - .map_err(|_| { - PyErr::new::(format!("Error opening bigWig.")) - })? - .cached(); - let bedin = BufReader::new(File::open(bed)?); - let iter = Box::new(bigwig_average_over_bed(bedin, read, name)); - - BigWigAverageOverBedEntriesIterator { iter, usename }.into_py(py) - } else { - #[cfg(feature = "remote")] - { - let read = BigWigReadRaw::open(RemoteFile::new(&bigwig)) - .map_err(|_| { - PyErr::new::(format!( - "Error opening bigBed." - )) - })? 
- .cached(); - let bedin = BufReader::new(File::open(bed)?); - let iter = Box::new(bigwig_average_over_bed(bedin, read, name)); - - BigWigAverageOverBedEntriesIterator { iter, usename }.into_py(py) - } - - #[cfg(not(feature = "remote"))] - { - return Err(PyErr::new::(format!( - "Builtin support for remote files is not available on this platform." - ))); - } - } - } - _ => { - return Err(PyErr::new::(format!( - "Invalid file type. Must be a bigWig (.bigWig, .bw)." - ))); - } - }; - - Ok(res) -} - /// The base module for opening a bigWig or bigBed. The only defined function is `open`. #[pymodule] fn pybigtools(_py: Python, m: &PyModule) -> PyResult<()> { @@ -2138,7 +2105,6 @@ fn pybigtools(_py: Python, m: &PyModule) -> PyResult<()> { m.add("BBIFileClosed", m.py().get_type::())?; m.add_wrapped(wrap_pyfunction!(open))?; - m.add_wrapped(wrap_pyfunction!(bigWigAverageOverBed))?; m.add_class::()?; m.add_class::()?; From a69bf96e63c871108a23d39e31efedefa98ee9c9 Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Wed, 10 Apr 2024 01:24:44 -0400 Subject: [PATCH 15/30] Use specialized Python exceptions --- pybigtools/src/lib.rs | 118 ++++++++++++++++++++++-------------------- 1 file changed, 63 insertions(+), 55 deletions(-) diff --git a/pybigtools/src/lib.rs b/pybigtools/src/lib.rs index 99a5313..b22393e 100644 --- a/pybigtools/src/lib.rs +++ b/pybigtools/src/lib.rs @@ -16,7 +16,7 @@ use bigtools::utils::misc::{ bigwig_average_over_bed, BigWigAverageOverBedEntry, BigWigAverageOverBedError, Name, }; use bigtools::{ - BBIFileRead, BBIReadError, BedEntry, BigBedRead as BigBedReadRaw, + BBIFileRead, BBIReadError as _BBIReadError, BedEntry, BigBedRead as BigBedReadRaw, BigBedWrite as BigBedWriteRaw, BigWigRead as BigWigReadRaw, BigWigWrite as BigWigWriteRaw, CachedBBIFileRead, Value, ZoomRecord, }; @@ -38,6 +38,7 @@ mod file_like; type ValueTuple = (u32, u32, f32); create_exception!(pybigtools, BBIFileClosed, exceptions::PyException); +create_exception!(pybigtools, BBIReadError, 
exceptions::PyException); fn start_end( bbi: &BBIReadRaw, @@ -59,7 +60,7 @@ fn start_end( let chrom = chroms.into_iter().find(|x| x.name == chrom_name); let length = match chrom { None => { - return Err(PyErr::new::(format!( + return Err(PyErr::new::(format!( "No chromomsome with name `{}` found.", chrom_name ))) @@ -101,7 +102,7 @@ fn start_end_length_inner( let chrom = chroms.into_iter().find(|x| x.name == chrom_name); let length = match chrom { None => { - return Err(PyErr::new::(format!( + return Err(PyErr::new::(format!( "No chromomsome with name `{}` found.", chrom_name ))) @@ -140,14 +141,21 @@ trait ToPyErr { fn to_py_err(self) -> PyErr; } -impl ToPyErr for bigtools::ZoomIntervalError { +impl ToPyErr for bigtools::BBIReadError { fn to_py_err(self) -> PyErr { - PyErr::new::(format!("{}", self)) + PyErr::new::(format!("{}", self)) } } -impl ToPyErr for bigtools::BBIReadError { +impl ToPyErr for bigtools::ZoomIntervalError { fn to_py_err(self) -> PyErr { - PyErr::new::(format!("{}", self)) + match self { + bigtools::ZoomIntervalError::ReductionLevelNotFound => { + PyErr::new::(format!( + "The passed reduction level was not found" + )) + } + _ => PyErr::new::(format!("{}", self)), + } } } @@ -192,7 +200,7 @@ fn intervals_to_array( (None, None) => PyArray1::from_vec(py, vec![missing; (end - start) as usize]).to_object(py), }; let v: &PyArray1 = arr.downcast::>(py).map_err(|_| { - PyErr::new::( + PyErr::new::( "`arr` option must be a one-dimensional numpy array, if passed.", ) })?; @@ -200,7 +208,7 @@ fn intervals_to_array( let bin_size = match bins { Some(bins) => { if v.len() != bins { - return Err(PyErr::new::(format!( + return Err(PyErr::new::(format!( "`arr` does not the expected size (expected `{}`, found `{}`), if passed.", bins, v.len(), @@ -251,8 +259,8 @@ fn intervals_to_array( } _ => { if v.len() != (end - start) as usize { - return Err(PyErr::new::(format!( - "`arr` does not the expected size (expected `{}`, found `{}`), if passed.", + return 
Err(PyErr::new::(format!( + "`arr` does not have the expected size (expected `{}`, found `{}`), if passed.", (end - start) as usize, v.len(), ))); @@ -327,7 +335,7 @@ fn entries_to_array( (None, None) => PyArray1::from_vec(py, vec![missing; (end - start) as usize]).to_object(py), }; let v: &PyArray1 = arr.downcast::>(py).map_err(|_| { - PyErr::new::( + PyErr::new::( "`arr` option must be a one-dimensional numpy array, if passed.", ) })?; @@ -335,8 +343,8 @@ fn entries_to_array( let bin_size = match bins { Some(bins) => { if v.len() != bins { - return Err(PyErr::new::(format!( - "`arr` does not the expected size (expected `{}`, found `{}`), if passed.", + return Err(PyErr::new::(format!( + "`arr` does not have the expected size (expected `{}`, found `{}`), if passed.", bins, v.len(), ))); @@ -386,8 +394,8 @@ fn entries_to_array( } _ => { if v.len() != (end - start) as usize { - return Err(PyErr::new::(format!( - "`arr` does not the expected size (expected `{}`, found `{}`), if passed.", + return Err(PyErr::new::(format!( + "`arr` does not have the expected size (expected `{}`, found `{}`), if passed.", (end - start) as usize, v.len(), ))); @@ -431,13 +439,13 @@ fn entries_to_array( Ok(arr) } -fn to_array>>( +fn to_array>>( start: i32, end: i32, iter: I, missing: f64, mut v: ArrayViewMut<'_, f64, numpy::Ix1>, -) -> Result<(), BBIReadError> { +) -> Result<(), _BBIReadError> { assert_eq!(v.len(), (end - start) as usize); v.fill(missing); for interval in iter { @@ -450,7 +458,7 @@ fn to_array>>( } Ok(()) } -fn to_array_bins>>( +fn to_array_bins>>( start: i32, end: i32, iter: I, @@ -458,7 +466,7 @@ fn to_array_bins>>( bins: usize, missing: f64, mut v: ArrayViewMut<'_, f64, numpy::Ix1>, -) -> Result<(), BBIReadError> { +) -> Result<(), _BBIReadError> { assert_eq!(v.len(), bins); v.fill(missing); @@ -556,7 +564,7 @@ fn to_array_bins>>( } Ok(()) } -fn to_array_zoom>>( +fn to_array_zoom>>( start: i32, end: i32, iter: I, @@ -564,7 +572,7 @@ fn to_array_zoom>>( bins: usize, 
missing: f64, mut v: ArrayViewMut<'_, f64, numpy::Ix1>, -) -> Result<(), BBIReadError> { +) -> Result<(), _BBIReadError> { assert_eq!(v.len(), bins); v.fill(missing); @@ -673,13 +681,13 @@ fn to_array_zoom>>( } Ok(()) } -fn to_entry_array>>( +fn to_entry_array>>( start: i32, end: i32, iter: I, missing: f64, mut v: ArrayViewMut<'_, f64, numpy::Ix1>, -) -> Result<(), BBIReadError> { +) -> Result<(), _BBIReadError> { assert_eq!(v.len(), (end - start) as usize); v.fill(missing); for interval in iter { @@ -692,7 +700,7 @@ fn to_entry_array>>( } Ok(()) } -fn to_entry_array_bins>>( +fn to_entry_array_bins>>( start: i32, end: i32, iter: I, @@ -700,7 +708,7 @@ fn to_entry_array_bins>>( bins: usize, missing: f64, mut v: ArrayViewMut<'_, f64, numpy::Ix1>, -) -> Result<(), BBIReadError> { +) -> Result<(), _BBIReadError> { assert_eq!(v.len(), bins); v.fill(missing); @@ -806,7 +814,7 @@ fn to_entry_array_bins>>( Ok(()) } -fn to_entry_array_zoom>>( +fn to_entry_array_zoom>>( start: i32, end: i32, iter: I, @@ -814,7 +822,7 @@ fn to_entry_array_zoom>>( bins: usize, missing: f64, mut v: ArrayViewMut<'_, f64, numpy::Ix1>, -) -> Result<(), BBIReadError> { +) -> Result<(), _BBIReadError> { assert_eq!(v.len(), bins); v.fill(missing); @@ -1086,11 +1094,10 @@ impl BBIRead { BBIReadRaw::BigBedFileLike(b) => b.autosql().convert_err()?, }; let obj = if parse { - let mut declarations = parse_autosql(&schema).map_err(|_| { - PyErr::new::("Unable to parse autosql.") - })?; + let mut declarations = parse_autosql(&schema) + .map_err(|_| PyErr::new::("Unable to parse autosql."))?; if declarations.len() > 1 { - return Err(PyErr::new::( + return Err(PyErr::new::( "Unexpected extra declarations.", )); } @@ -1304,7 +1311,7 @@ impl BBIRead { "min" => Summary::Min, "max" => Summary::Max, _ => { - return Err(PyErr::new::(format!( + return Err(PyErr::new::(format!( "Unrecognized summary. Only `mean`, `min`, and `max` are allowed." 
))); } @@ -1417,7 +1424,7 @@ impl BBIRead { } }, Err(_) => { - return Err(PyErr::new::("Invalid names argument. Should be either `None`, a `bool`, or an `int`")); + return Err(PyErr::new::("Invalid names argument. Should be either `None`, a `bool`, or an `int`")); } }, }, @@ -1491,7 +1498,7 @@ impl BBIRead { #[pyclass(module = "pybigtools")] struct ZoomIntervalIterator { - iter: Box> + Send>, + iter: Box> + Send>, } #[pymethods] @@ -1528,7 +1535,7 @@ impl ZoomIntervalIterator { /// any missing intervals. #[pyclass(module = "pybigtools")] struct BigWigIntervalIterator { - iter: Box> + Send>, + iter: Box> + Send>, } #[pymethods] @@ -1549,7 +1556,7 @@ impl BigWigIntervalIterator { /// This class is an interator for the entries in a bigBed file. #[pyclass(module = "pybigtools")] struct BigBedEntriesIterator { - iter: Box> + Send>, + iter: Box> + Send>, } #[pymethods] @@ -1600,7 +1607,7 @@ impl BigWigWrite { let bigwig = self .bigwig .take() - .ok_or_else(|| PyErr::new::("File already closed."))?; + .ok_or_else(|| PyErr::new::("File already closed."))?; let chrom_map = chroms .into_iter() .map(|(key, val)| { @@ -1738,7 +1745,7 @@ impl BigBedWrite { let bigbed = self .bigbed .take() - .ok_or_else(|| PyErr::new::("File already closed."))?; + .ok_or_else(|| PyErr::new::("File already closed."))?; let chrom_map = chroms .into_iter() .map(|(key, val)| { @@ -1928,7 +1935,7 @@ fn open(py: Python, path_url_or_file_like: PyObject, mode: Option) -> Py Some(mode) if mode == "r" => false, None => false, Some(mode) => { - return Err(PyErr::new::(format!( + return Err(PyErr::new::(format!( "Invalid mode: `{}`", mode ))); @@ -1941,13 +1948,13 @@ fn open(py: Python, path_url_or_file_like: PyObject, mode: Option) -> Py } if iswrite { - return Err(PyErr::new::(format!( + return Err(PyErr::new::(format!( "Writing only supports file names", ))); } let file_like = match PyFileLikeObject::new(path_url_or_file_like, true, false, true) { Ok(file_like) => file_like, - Err(_) => return 
Err(PyErr::new::(format!( + Err(_) => return Err(PyErr::new::(format!( "Unknown argument for `path_url_or_file_like`. Not a file path string or url, and not a file-like object.", ))), }; @@ -1962,7 +1969,7 @@ fn open(py: Python, path_url_or_file_like: PyObject, mode: Option) -> Py } .into_py(py), Err(e) => { - return Err(PyErr::new::(format!( + return Err(PyErr::new::(format!( "File-like object is not a bigWig or bigBed. Or there was just a problem reading: {e}", ))) } @@ -1982,13 +1989,13 @@ fn open_path_or_url( { Some(e) => e.to_string(), None => { - return Err(PyErr::new::(format!("Invalid file type. Must be either a bigWig (.bigWig, .bw) or bigBed (.bigBed, .bb)."))); + return Err(PyErr::new::(format!("Invalid file type. Must be either a bigWig (.bigWig, .bw) or bigBed (.bigBed, .bb)."))); } }; let res = if iswrite { match Url::parse(&path_url_or_file_like) { Ok(_) => { - return Err(PyErr::new::(format!( + return Err(PyErr::new::(format!( "Invalid file path. Writing does not support urls." ))) } @@ -2004,7 +2011,7 @@ fn open_path_or_url( } .into_py(py), _ => { - return Err(PyErr::new::(format!("Invalid file type. Must be either a bigWig (.bigWig, .bw) or bigBed (.bigBed, .bb)."))); + return Err(PyErr::new::(format!("Invalid file type. Must be either a bigWig (.bigWig, .bw) or bigBed (.bigBed, .bb)."))); } } } else { @@ -2013,8 +2020,8 @@ fn open_path_or_url( match Url::parse(&path_url_or_file_like) { Ok(_) => {} Err(_) => { - return Err(PyErr::new::(format!( - "Invalid file path. The file does not exists and it is not a url." + return Err(PyErr::new::(format!( + "Invalid file path. The file does not exist and it is not a url." ))) } } @@ -2028,7 +2035,7 @@ fn open_path_or_url( } .into_py(py), Err(_) => { - return Err(PyErr::new::(format!( + return Err(PyErr::new::(format!( "Error opening bigWig." 
))) } @@ -2041,7 +2048,7 @@ fn open_path_or_url( } .into_py(py), Err(_) => { - return Err(PyErr::new::(format!( + return Err(PyErr::new::(format!( "Error opening bigWig." ))) } @@ -2049,7 +2056,7 @@ fn open_path_or_url( #[cfg(not(feature = "remote"))] { - return Err(PyErr::new::(format!( + return Err(PyErr::new::(format!( "Builtin support for remote files is not supported on this platform." ))); } @@ -2063,7 +2070,7 @@ fn open_path_or_url( } .into_py(py), Err(_) => { - return Err(PyErr::new::(format!( + return Err(PyErr::new::(format!( "Error opening bigBed." ))) } @@ -2076,7 +2083,7 @@ fn open_path_or_url( } .into_py(py), Err(_) => { - return Err(PyErr::new::(format!( + return Err(PyErr::new::(format!( "Error opening bigBed." ))) } @@ -2084,14 +2091,14 @@ fn open_path_or_url( #[cfg(not(feature = "remote"))] { - return Err(PyErr::new::(format!( + return Err(PyErr::new::(format!( "Builtin support for remote files is not supported on this platform." ))); } } } _ => { - return Err(PyErr::new::(format!("Invalid file type. Must be either a bigWig (.bigWig, .bw) or bigBed (.bigBed, .bb)."))); + return Err(PyErr::new::(format!("Invalid file type. 
Must be either a bigWig (.bigWig, .bw) or bigBed (.bigBed, .bb)."))); } } }; @@ -2103,6 +2110,7 @@ fn open_path_or_url( fn pybigtools(_py: Python, m: &PyModule) -> PyResult<()> { m.add("__version__", env!("CARGO_PKG_VERSION"))?; m.add("BBIFileClosed", m.py().get_type::())?; + m.add("BBIReadError", m.py().get_type::())?; m.add_wrapped(wrap_pyfunction!(open))?; From a555056fe28e6406e4fb12cb97ec31a9ebcf149c Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Wed, 10 Apr 2024 01:24:56 -0400 Subject: [PATCH 16/30] test: Refactor tests Update test_bigwig tests Linting Update tests Update tests Add bigbed fixture Update tests Add test for info method --- pybigtools/tests/test_api.py | 338 ++++++++++++++++++++++++-------- pybigtools/tests/test_bigwig.py | 53 ++--- 2 files changed, 288 insertions(+), 103 deletions(-) diff --git a/pybigtools/tests/test_api.py b/pybigtools/tests/test_api.py index 2cf004a..4d3f29b 100644 --- a/pybigtools/tests/test_api.py +++ b/pybigtools/tests/test_api.py @@ -1,111 +1,291 @@ -import pybigtools import math +import pathlib +from io import BytesIO + import numpy as np +import pytest + +import pybigtools + +TEST_DIR = pathlib.Path(__file__).parent +REPO_ROOT = TEST_DIR.parent.parent + + +def test_open_close(): + path = str(REPO_ROOT / "bigtools/resources/test/valid.bigWig") + b = pybigtools.open(path, "r") + b.close() + assert pytest.raises(pybigtools.BBIFileClosed, b.chroms) + + # Files are closed when exiting a context manager + with pybigtools.open(path, "r") as b: + pass + assert pytest.raises(pybigtools.BBIFileClosed, b.chroms) + + # Invalid file-like object + s = BytesIO() + assert pytest.raises(pybigtools.BBIReadError, pybigtools.open, s, "r") + + +@pytest.fixture +def bw(): + path = str(REPO_ROOT / "bigtools/resources/test/valid.bigWig") + return pybigtools.open(path, "r") + + +@pytest.fixture +def bb(): + path = str(TEST_DIR / "data/bigBedExample.bb") + return pybigtools.open(path, "r") + + +def test_check_filetype(bw, bb): + assert not 
bw.is_bigbed + assert bw.is_bigwig -def test_api(): - b = pybigtools.open('../bigtools/resources/test/valid.bigWig', 'r') + assert bb.is_bigbed + assert not bb.is_bigwig - # Check type of file - assert b.is_bigbed == False - assert b.is_bigwig == True +def test_info(bw): + assert bw.info() == { + 'version': 4, + 'isCompressed': True, + 'primaryDataSize': 603305, + 'zoomLevels': 10, + 'chromCount': 1, + 'summary': { + 'basesCovered': 137894, + 'sum': 89001714.97238067, + 'mean': 645.4357330440822, + 'min': 0.006219999864697456, + 'max': 14254.0, + 'std': 751.0146556298351 + }, + } + + +def test_chroms(bw, bb): # No args => dict - assert b.chroms() == {'chr17': 83257441} + assert bw.chroms() == {"chr17": 83_257_441} + assert bb.chroms() == {"chr21": 48_129_895} + # Arg with chrom name => length - assert b.chroms('chr17') == 83257441 + assert bw.chroms("chr17") == 83257441 + assert bb.chroms("chr21") == 48129895 + # Missing chrom => None - assert b.chroms('chr11') == None + assert bw.chroms("chr11") is None + assert bb.chroms("chr11") is None + +def test_zooms(bw, bb): # Get a list of zooms - assert b.zooms() == [10, 40, 160, 640, 2560, 10240, 40960, 163840, 655360, 2621440] + assert bw.zooms() == [10, 40, 160, 640, 2560, 10240, 40960, 163840, 655360, 2621440] + assert bb.zooms() == [3911, 39110, 391100, 3911000, 39110000] + +def test_autosql(bw, bb): # Even bigwigs have sql (a sql representing bedGraph) - assert "bedGraph" in b.sql() + assert "bedGraph" in bw.sql() # We can parse the sql - assert b.sql(True)['name'] == 'bedGraph' + # assert bw.sql(True)['name'] == 'bedGraph' + + # bb.sql() + # BBIReadError: The file was invalid: Invalid autosql: not UTF-8 + +def test_records(bw, bb): # (chrom, None, None) => all records on chrom - assert len(list(b.records('chr17'))) == 100000 + assert len(list(bw.records("chr17"))) == 100_000 + assert len(list(bb.records("chr21"))) == 14_810 + # (chrom, start, None) => all records from (start, ) - assert len(list(b.records('chr17', 
100000))) == 91360 + assert len(list(bw.records("chr17", 100_000))) == 91_360 + assert len(list(bb.records("chr21", 10_000_000))) == 14_799 + # (chrom, start, end) => all records from (start, end) - assert len(list(b.records('chr17', 100000, 110000))) == 1515 + assert len(list(bw.records("chr17", 100_000, 110_000))) == 1515 + assert len(list(bb.records("chr21", 10_000_000, 20_000_000))) == 233 + # Out of bounds start/end are truncated - assert len(list(b.records('chr17', -1000, 100000))) == 8641 - assert len(list(b.records('chr17', -1000, -500))) == 0 - assert len(list(b.records('chr17', 0, 84000000))) == 100000 - assert next(b.records('chr17')) == (59898, 59900, 0.06791999936103821) + x = list(bw.records("chr17", 0, 100_000)) + assert len(x) == 8641 + assert list(bw.records("chr17", -1000, 100_000)) == x + x = list(bb.records("chr21", 0, 10_000_000)) + assert len(x) == 11 + assert list(bb.records("chr21", -1000, 10_000_000)) == x + + y = list(bw.records("chr17", 0, bw.chroms("chr17"))) + assert len(y) == 100_000 + assert list(bw.records("chr17", 0, bw.chroms("chr17") * 2)) == y + assert next(bw.records("chr17")) == (59898, 59900, 0.06791999936103821) + y = list(bb.records("chr21", 0, bb.chroms("chr21"))) + assert len(y) == 14810 + assert list(bb.records("chr21", 0, bb.chroms("chr21") * 2)) == y + assert next(bb.records("chr21")) == (9434178, 9434609) + + # Fully out of bounds ranges return no records + assert len(list(bw.records("chr17", -1000, -500))) == 0 + assert len(list(bw.records("chr17", 83_257_441, 84_000_000))) == 0 + + assert len(list(bb.records("chr21", -1000, -500))) == 0 + assert len(list(bb.records("chr21", 48_129_895, 49_000_000))) == 0 + # Unknown chrom => exception - try: - b.zoom_records('chr11') - except: - pass - else: - assert False + assert pytest.raises(KeyError, bw.records, "chr11") + assert pytest.raises(KeyError, bb.records, "chr11") + +def test_zoom_records(bw, bb): # (chrom, None, None) => all records on chrom - assert 
len(list(b.zoom_records(10, 'chr17'))) == 13811 + assert len(list(bw.zoom_records(10, "chr17"))) == 13811 + assert len(list(bb.zoom_records(3911, "chr21"))) == 1676 + # (chrom, start, None) => all records from (start, ) - assert len(list(b.zoom_records(10, 'chr17', 100000))) == 10872 + assert len(list(bw.zoom_records(10, "chr17", 100_000))) == 10872 + assert len(list(bb.zoom_records(3911, "chr21", 10_000_000))) == 1670 + # (chrom, start, end) => all records from (start, end) - assert len(list(b.zoom_records(10, 'chr17', 100000, 110000))) == 766 + assert len(list(bw.zoom_records(10, "chr17", 100_000, 110_000))) == 766 + assert len(list(bb.zoom_records(3911, "chr21", 10_000_000, 20_000_000))) == 154 + # Out of bounds start/end are truncated - assert len(list(b.zoom_records(10, 'chr17', -1000, 100000))) == 2940 - assert len(list(b.zoom_records(10, 'chr17', -1000, -500))) == 0 - assert len(list(b.zoom_records(10, 'chr17', 0, 84000000))) == 13811 - assert next(b.zoom_records(10, 'chr17', 0, 100000)) == (59898, 59908, {'total_items': 0, 'bases_covered': 10, 'min_val': 0.06791999936103821, 'max_val': 0.16627000272274017, 'sum': 1.4660000801086426, 'sum_squares': 0.2303919643163681}) - assert next(b.zoom_records(160, 'chr17', 0, 100000)) == (59898, 60058, {'total_items': 0, 'bases_covered': 160, 'min_val': 0.06791999936103821, 'max_val': 0.8688300251960754, 'sum': 101.3516616821289, 'sum_squares': 80.17473602294922}) + x = list(bw.zoom_records(10, "chr17", 0, 100_000)) + assert len(x) == 2940 + assert list(bw.zoom_records(10, "chr17", -1000, 100_000)) == x + x = list(bb.zoom_records(3911, "chr21", 0, 10_000_000)) + assert len(x) == 6 + assert list(bb.zoom_records(3911, "chr21", -1000, 10_000_000)) == x + + y = list(bw.zoom_records(10, "chr17", 0, bw.chroms("chr17"))) + assert len(y) == 13811 + assert list(bw.zoom_records(10, "chr17", 0, bw.chroms("chr17") * 2)) == y + y = list(bb.zoom_records(3911, "chr21", 0, bb.chroms("chr21"))) + assert len(y) == 1676 + assert 
list(bb.zoom_records(3911, "chr21", 0, bb.chroms("chr21") * 2)) == y + + # Fully out of bounds ranges return no records + assert len(list(bw.zoom_records(10, "chr17", -1000, -500))) == 0 + assert len(list(bw.zoom_records(10, "chr17", 83_257_441, 84_000_000))) == 0 + assert len(list(bb.zoom_records(3911, "chr21", -1000, -500))) == 0 + assert len(list(bb.zoom_records(3911, "chr21", 48_129_895, 49_000_000))) == 0 + + assert next(bw.zoom_records(10, "chr17", 0, 100_000)) == ( + 59898, + 59908, + { + "total_items": 0, + "bases_covered": 10, + "min_val": 0.06791999936103821, + "max_val": 0.16627000272274017, + "sum": 1.4660000801086426, + "sum_squares": 0.2303919643163681, + }, + ) + assert next(bw.zoom_records(160, "chr17", 0, 100000)) == ( + 59898, + 60058, + { + "total_items": 0, + "bases_covered": 160, + "min_val": 0.06791999936103821, + "max_val": 0.8688300251960754, + "sum": 101.3516616821289, + "sum_squares": 80.17473602294922, + }, + ) + # Unknown zoom => exception - try: - b.zoom_records(0, 'chr17') - except: - pass - else: - assert False + assert pytest.raises(KeyError, bw.zoom_records, 0, "chr17") + assert pytest.raises(KeyError, bb.zoom_records, 0, "chr21") + # Unknown chrom => exception - try: - b.zoom_records(10, 'chr11') - except: - pass - else: - assert False - - assert len(list(b.values('chr17', 100000, 110000))) == 10000 - assert len(list(b.values('chr17', 100000, 110000, 10))) == 10 - assert b.values('chr17', 100000, 110000, 10)[0] == 0.37435242314338685 - assert b.values('chr17', 100000, 110000, 10, 'max')[0] == 1.1978399753570557 - assert b.values('chr17', 100000, 110000, 10, 'min')[0] == 0.05403999984264374 - assert b.values('chr17', 100000, 110000, 10, 'mean', True)[0] == 0.37885534041374924 - assert list(b.values('chr17', 59890, 59900, 10, 'mean', True)) == [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.06791999936103821, 0.06791999936103821] - assert list(b.values('chr17', 59890, 59900, 10, 'mean', True, -1.0)) == [-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, 
-1.0, -1.0, 0.06791999936103821, 0.06791999936103821] - assert math.isnan(b.values('chr17', -10, 10, 20, 'mean', True, 0.0)[0]) - assert not math.isnan(b.values('chr17', -10, 10, 20, 'mean', True, 0.0)[19]) - assert b.values('chr17', -10, 10, 20, 'mean', True, 0.0, 0.0)[0] == 0.0 - arr = np.zeros(20) - ret_arr = b.values('chr17', -10, 10, 20, 'mean', True, 0.0, np.nan, arr) + assert pytest.raises(KeyError, bw.zoom_records, 10, "chr11") + assert pytest.raises(KeyError, bb.zoom_records, 3911, "chr11") + + +def test_values(bw, bb): + assert len(bw.values("chr17", 100_000, 110_000)) == 10_000 + assert len(bb.values("chr21", 10_148_000, 10_158_000)) == 10_000 + + assert len(bw.values("chr17", 100000, 110000, 10)) == 10 + assert len(bb.values("chr21", 10_148_000, 10_158_000, 10)) == 10 + + assert bw.values("chr17", 100000, 110000, 10)[0] == 0.37435242314338685 + assert bb.values("chr21", 10_148_000, 10_158_000, 10)[0] == 0.175 + + assert bw.values("chr17", 100000, 110000, 10, "max")[0] == 1.1978399753570557 + assert bb.values("chr21", 10_148_000, 10_158_000, 10, "max")[0] == 1.0 + + assert bw.values("chr17", 100000, 110000, 10, "min")[0] == 0.05403999984264374 + assert bb.values("chr21", 10_148_000, 10_158_000, 10, "min")[0] == 0.0 + + assert ( + bw.values("chr17", 100000, 110000, 10, "mean", exact=True)[0] + == 0.37885534041374924 + ) + assert ( + bb.values("chr21", 10_148_000, 10_158_000, 10, "mean", exact=True)[0] + == 0.175 + ) + + assert list(bw.values("chr17", 59890, 59900, 10, "mean", exact=True)) == [ + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.06791999936103821, + 0.06791999936103821, + ] + + assert list( + bw.values("chr17", 59890, 59900, 10, "mean", exact=True, missing=-1.0) + ) == [ + -1.0, + -1.0, + -1.0, + -1.0, + -1.0, + -1.0, + -1.0, + -1.0, + 0.06791999936103821, + 0.06791999936103821, + ] + + x = bw.values("chr17", -10, 10, 20, "mean", exact=True, missing=0.0) + assert math.isnan(x[0]) + assert not math.isnan(x[19]) + x = 
bb.values("chr21", -10, 10, 20, "mean", exact=True, missing=0.0) + assert math.isnan(x[0]) + assert not math.isnan(x[19]) + + x = bw.values("chr17", -10, 10, 20, "mean", exact=True, missing=0.0, oob=0.0) + assert x[0] == 0.0 + x = bb.values("chr21", -10, 10, 20, "mean", exact=True, missing=0.0, oob=0.0) + assert x[0] == 0.0 + # The returned array is the same as the one passed, so both show the same values + arr = np.zeros(20) + ret_arr = bw.values( + "chr17", -10, 10, 20, "mean", exact=True, missing=0.0, oob=np.nan, arr=arr + ) assert math.isnan(arr[0]) assert arr[19] == 0.0 assert math.isnan(ret_arr[0]) assert ret_arr[19] == 0.0 + assert np.array_equal(arr, ret_arr, equal_nan=True) - b.close() - # Closing means file is not usable - try: - b.chroms() - except: - pass - else: - assert False - - b = pybigtools.open('../bigtools/resources/test/valid.bigWig', 'r') - # Files are closed when exiting a context manager - with b: - pass - try: - b.chroms() - except: - pass - else: - assert False - + ret_arr = bb.values( + "chr21", -10, 10, 20, "mean", exact=True, missing=0.0, oob=np.nan, arr=arr + ) + assert math.isnan(arr[0]) + assert arr[19] == 0.0 + assert math.isnan(ret_arr[0]) + assert ret_arr[19] == 0.0 + assert np.array_equal(arr, ret_arr, equal_nan=True) diff --git a/pybigtools/tests/test_bigwig.py b/pybigtools/tests/test_bigwig.py index 95ff918..ff3191b 100644 --- a/pybigtools/tests/test_bigwig.py +++ b/pybigtools/tests/test_bigwig.py @@ -1,40 +1,45 @@ +import math import os +import pathlib +import urllib.request -def install_dependencies(): - import urllib.request - if not os.path.exists('./ENCFF667CZO.bigWig'): - print("Downloading test bigWig") - urllib.request.urlretrieve('https://www.encodeproject.org/files/ENCFF667CZO/@@download/ENCFF667CZO.bigWig', './ENCFF667CZO.bigWig') +import pybigtools -print("Testing") -# Imports for test -import math +TEST_DIR = pathlib.Path(__file__).parent -# Test -import pybigtools -def test_bigwig_write_read(tmp_path): - 
install_dependencies() - # bigWig read - b = pybigtools.open('./ENCFF667CZO.bigWig') +def retrieve_encode_file(accession, filetype): + file_url = f"https://www.encodeproject.org/files/{accession}/@@download/{accession}.{filetype}" + path = TEST_DIR / f"data/{accession}.{filetype}" + if not path.exists(): + urllib.request.urlretrieve(file_url, path) + return path + - i = b.intervals("chr1") +def test_bigwig_read(): + path = retrieve_encode_file("ENCFF667CZO", "bigWig") + b = pybigtools.open(str(path)) + + i = b.records("chr1") n = next(i) assert n[0] == 10495 assert n[1] == 10545 assert math.isclose(n[2], 0.01591, abs_tol=0.00001) - i = b.intervals("chr12") + i = b.records("chr12") c = 0 for _ in i: c += 1 assert c == 1028747 - # bigWig write - chroms = ['chr1', 'chr2', 'chr3'] - clengths = { 'chr1': 10000, 'chr2': 8000, 'chr3': 6000 } + +def test_bigwig_write(tmpdir): + chroms = ["chr1", "chr2", "chr3"] + clengths = {"chr1": 10000, "chr2": 8000, "chr3": 6000} + def genintervals(): import random + for chrom in chroms: clength = clengths[chrom] current = random.randint(0, 300) @@ -49,20 +54,20 @@ def genintervals(): start = end + random.randint(20, 50) intervals = list(genintervals()) - b = pybigtools.open('./test.bigWig', 'w') + b = pybigtools.open(os.path.join(tmpdir, "test.bigWig"), "w") b.write(clengths, iter(intervals)) # If didn't need to test, could also be done like - #b.write(clengths, genintervals()) + # b.write(clengths, genintervals()) - b = pybigtools.open('./test.bigWig') + b = pybigtools.open(os.path.join(tmpdir, "test.bigWig")) i = [] for chrom in chroms: - i.extend(list(b.intervals(chrom))) + i.extend(list(b.records(chrom))) c = 0 for _ in i: c += 1 assert c == len(intervals) - for a,b in zip(i, intervals): + for a, b in zip(i, intervals): assert a[0] == b[1] assert a[1] == b[2] assert math.isclose(a[2], b[3], abs_tol=0.00001) From ac2b9aab654c3a9c3d12bfeb572fb2e32074639a Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Wed, 10 Apr 2024 13:09:52 
-0400 Subject: [PATCH 17/30] docs: Numpy-style docstrings for methods Update docstrings --- pybigtools/src/lib.rs | 370 +++++++++++++++++++++++++++++++----------- 1 file changed, 272 insertions(+), 98 deletions(-) diff --git a/pybigtools/src/lib.rs b/pybigtools/src/lib.rs index b22393e..aff15eb 100644 --- a/pybigtools/src/lib.rs +++ b/pybigtools/src/lib.rs @@ -933,7 +933,7 @@ fn to_entry_array_zoom>>( Ok(()) } -/// This class is the interface for reading a bigWig or bigBed. +/// Interface for reading a BigWig or BigBed file. #[pyclass(module = "pybigtools")] struct BBIRead { bbi: BBIReadRaw, @@ -981,6 +981,7 @@ impl BBIRead { } } + /// Return a dict of information about the BBI file. fn info(&mut self, py: Python<'_>) -> PyResult { let (info, summary) = match &mut self.bbi { BBIReadRaw::Closed => return Err(BBIFileClosed::new_err("File is closed.")), @@ -1043,6 +1044,8 @@ impl BBIRead { Ok(info) } + /// Return a list of sizes in bases of the summary intervals used in each + /// of the zoom levels (i.e. reduction levels) of the BBI file. fn zooms(&self) -> PyResult> { let zooms = match &self.bbi { BBIReadRaw::Closed => return Err(BBIFileClosed::new_err("File is closed.")), @@ -1058,20 +1061,38 @@ impl BBIRead { Ok(zooms.iter().map(|z| z.reduction_level).collect()) } - /// Returns the autosql of this bbi file. + /// Return the autoSql schema definition of this BBI file. /// - /// For bigBeds, this comes directly from the autosql stored in the file. - /// For bigWigs, the autosql returned matches that of a bedGraph file. + /// For BigBeds, this schema comes directly from the autoSql string stored + /// in the file. For BigWigs, the schema generated describes a bedGraph + /// file. /// - /// By default, the autosql is returned as a string. 
Passing `parse = true` - /// returns instead a dictionary of the format: - /// ``` - /// { - /// "name": , - /// "comment": , - /// "fields": [(, , ), ...], - /// } - /// ``` + /// Parameters + /// ---------- + /// parse : bool, optional [default: False] + /// If True, return the schema as a dictionary. If False, return the + /// schema as a string. Default is False. + /// + /// Returns + /// ------- + /// schema : str or dict + /// The autoSql schema of the BBI file. If `parse` is True, the schema + /// is returned as a dictionary of the format: + /// + /// ``` + /// { + /// "name": , + /// "comment": , + /// "fields": [(, , ), ...], + /// } + /// ``` + /// + /// See Also + /// -------- + /// is_bigwig : Check if the BBI file is a bigWig. + /// is_bigbed : Check if the BBI file is a bigBed. + /// info : Get information about the BBI file. + /// zooms : Get the zoom levels of the BBI file. #[pyo3(signature = (parse = false))] fn sql(&mut self, py: Python, parse: bool) -> PyResult { pub const BEDGRAPH: &str = r#"table bedGraph @@ -1126,19 +1147,36 @@ impl BBIRead { Ok(obj) } - /// Returns the records of a given range on a chromosome. + /// Return the records of a given range on a chromosome. /// - /// The result is an iterator of tuples. For bigWigs, these tuples are in - /// the format (start: int, end: int, value: float). For bigBeds, these + /// The result is an iterator of tuples. For BigWigs, these tuples are in + /// the format (start: int, end: int, value: float). For BigBeds, these /// tuples are in the format (start: int, end: int, ...), where the "rest" /// fields are split by whitespace. /// - /// Missing values in bigWigs will results in non-contiguous records. + /// Parameters + /// ---------- + /// chrom : str + /// Name of the chromosome. + /// start, end : int, optional + /// The range to get values for. If end is not provided, it defaults to + /// the length of the chromosome. 
If start is not provided, it defaults + /// to the beginning of the chromosome. + /// + /// Returns + /// ------- + /// Iterator[tuple[int, int, float] or tuple[int, int, ...]] + /// An iterator of tuples in the format (start: int, end: int, value: + /// float) for BigWigs, or (start: int, end: int, *rest) for BigBeds. + /// + /// Notes + /// ----- + /// Missing values in BigWigs will results in non-contiguous records. /// - /// The chrom argument is the name of the chromosome. - /// The start and end arguments denote the range to get values for. - /// If end is not provided, it defaults to the length of the chromosome. - /// If start is not provided, it defaults to the beginning of the chromosome. + /// See Also + /// -------- + /// zoom_records : Get the zoom records of a given range on a chromosome. + /// values : Get the values of a given range on a chromosome. fn records( &mut self, py: Python<'_>, @@ -1196,15 +1234,52 @@ impl BBIRead { } } - /// Returns the zoom records of a given range on a chromosome for a given zoom level. + /// Return the zoom records of a given range on a chromosome for a given + /// zoom level. /// /// The result is an iterator of tuples. These tuples are in the format /// (start: int, end: int, summary: dict). /// - /// The chrom argument is the name of the chromosome. - /// The start and end arguments denote the range to get values for. - /// If end is not provided, it defaults to the length of the chromosome. - /// If start is not provided, it defaults to the beginning of the chromosome. + /// Parameters + /// ---------- + /// reduction_level : int + /// The zoom level to use, as a resolution in bases. Use the ``zooms`` + /// method to get a list of available zoom levels. + /// chrom : str + /// Name of the chromosome. + /// start, end : int, optional + /// The range to get values for. If end is not provided, it defaults + /// to the length of the chromosome. 
If start is not provided, it + /// defaults to the beginning of the chromosome. + /// + /// Returns + /// ------- + /// Iterator[tuple[int, int, dict]] + /// An iterator of tuples in the format (start: int, end: int, + /// summary: dict). + /// + /// Notes + /// ----- + /// The summary dictionary contains the following keys + /// + /// - ``total_items``: The number of items in the interval. + /// - ``bases_covered``: The number of bases covered by the interval. + /// - ``min_val``: The minimum value in the interval. + /// - ``max_val``: The maximum value in the interval. + /// - ``sum``: The sum of all values in the interval. + /// - ``sum_squares``: The sum of the squares of all values in the interval. + /// + /// For BigWigs, the summary statistics are derived from the unique + /// **signal values** associated with each base in the interval. + /// + /// For BigBeds, the summary statistics instead are derived from the + /// **number of BED intervals** overlapping each base in the interval. + /// + /// See Also + /// -------- + /// zooms : Get a list of available zoom levels. + /// records : Get the records of a given range on a chromosome. + /// values : Get the values of a given range on a chromosome. fn zoom_records( &mut self, reduction_level: u32, @@ -1274,21 +1349,66 @@ impl BBIRead { } } - /// Returns the values of a given range on a chromosome. + /// Return the values of a given range on a chromosome as a numpy array. + /// + /// For BigWigs, the returned values or summary statistics are derived + /// from the unique **signal values** associated with each base. + /// + /// For BigBeds, the returned values or summary statistics instead are + /// derived from the **number of BED intervals** overlapping each base. /// - /// For bigWigs, the result is an array of length (end - start). - /// If a value does not exist in the bigwig for a specific base, it will be nan. + /// Parameters + /// ---------- + /// chrom : str + /// Name of the chromosome. 
+ /// start, end : int, optional + /// The range to get values for. If end is not provided, it defaults + /// to the length of the chromosome. If start is not provided, it + /// defaults to the beginning of the chromosome. + /// bins : int, optional + /// If provided, the query interval will be divided into equally spaced + /// bins and the values in each bin will be interpolated or summarized. + /// If not provided, the values will be returned for each base. + /// summary : Literal["mean", "min", "max"], optional [default: "mean"] + /// The summary statistic to use. Currently supported statistics are + /// ``mean``, ``min``, and ``max``. + /// exact : bool, optional [default: False] + /// If True and ``bins`` is specified, return exact summary statistic + /// values instead of interpolating from the optimal zoom level. + /// Default is False. + /// missing : float, optional [default: 0.0] + /// Fill-in value for unreported data in valid regions. Default is 0. + /// oob : float, optional [default: NaN] + /// Fill-in value for out-of-bounds regions. Default is NaN. + /// arr : numpy.ndarray, optional + /// If provided, the values will be written to this array or array + /// view. The array must be of the correct size and type. /// - /// For bigBeds, the returned array instead represents a pileup of the count - /// of intervals overlapping each base. + /// Returns + /// ------- + /// numpy.ndarray + /// The signal values of the bigwig or bigbed in the specified range. /// - /// The chrom argument is the name of the chromosome. - /// The start and end arguments denote the range to get values for. - /// If end is not provided, it defaults to the length of the chromosome. - /// If start is not provided, it defaults to the beginning of the chromosome. - /// The default oob value is `numpy.nan`. 
+ /// Notes + /// ----- + /// A BigWig file encodes a step function, and the value at + /// a base is given by the signal value of the unique interval that + /// contains that base. /// - /// This returns a numpy array. + /// A BigBed file encodes a collection of (possibly overlapping) intervals + /// which may or may not be associated with quantitative scores. The + /// "value" at given base used here summarizes the number of intervals + /// overlapping that base, not any particular score. + /// + /// If a number of bins is requested and ``exact`` is False, the summarized + /// data is interpolated from the closest available zoom level. If you + /// need accurate summary data and are okay with small trade-off in speed, + /// set ``exact`` to True. + /// + /// See Also + /// -------- + /// records : Get the records of a given range on a chromosome. + /// zoom_records : Get the zoom records of a given range on a chromosome. #[pyo3( signature = (chrom, start, end, bins=None, summary="mean".to_string(), exact=false, missing=0.0, oob=f64::NAN, arr=None), text_signature = r#"(chrom, start, end, bins=None, summary="mean", exact=False, missing=0.0, oob=..., arr=None)"#, @@ -1341,12 +1461,19 @@ impl BBIRead { } } - /// Returns the chromosomes in a bigwig, and their lengths. + /// Return the names of chromosomes in a BBI file and their lengths. /// - /// The chroms argument can be either String or None. - /// If it is None, then all chroms will be returned. - /// If it is a String, then the length of that chromosome will be returned. - /// If the chromosome doesn't exist, nothing will be returned. + /// Parameters + /// ---------- + /// chrom : str or None + /// The name of the chromosome to get the length of. If None, then a + /// dictionary of all chromosome sizes will be returned. If the + /// chromosome doesn't exist, returns None. + /// + /// Returns + /// ------- + /// int or Dict[str, int] or None: + /// Chromosome length or a dictionary of chromosome lengths. 
fn chroms(&mut self, py: Python, chrom: Option) -> PyResult> { fn get_chrom_obj( b: &B, @@ -1383,25 +1510,38 @@ impl BBIRead { } /// Gets the average values from a bigWig over the entries of a bed file. - /// Raises an exception if the current file is a bigBed. /// - /// Parameters: - /// bed (str): The path to the bed. - /// names (None, bool, or int): - /// If `None`, then each return value will be a single `float`, - /// the average value over an interval in the bed file. - /// If `True`, then each return value will be a tuple of the value of column 4 - /// and the average value over the interval with that name in the bed file. - /// If `False`, then each return value will be a tuple of the interval in the format - /// `{chrom}:{start}-{end}` and the average value over that interval. - /// If `0`, then each return value will match as if `False` was passed. - /// If a `1+`, then each return value will be a tuple of the value of column of this - /// parameter (1-based) and the average value over the interval. + /// Parameters + /// ---------- + /// bed : str + /// The path to the bed. + /// names : bool or int, optional + /// If ``None``, then each return value will be a single float, the + /// average value over an interval in the bed file. + /// + /// If ``True``, then each return value will be a tuple of the value of + /// column 4 and the average value over the interval with that name in the + /// bed file. + /// + /// If ``False``, then each return value will be a tuple of the interval + /// in the format ``{chrom}:{start}-{end}`` and the average value over + /// that interval. + /// + /// If ``0``, then each return value will match as if ``False`` was passed. + /// + /// If a ``1+``, then each return value will be a tuple of the value of + /// column of this parameter (1-based) and the average value over the + /// interval. + /// + /// Returns + /// ------- + /// Generator of float or tuple. /// - /// Returns: - /// This returns a generator of values. 
(Therefore, to save to a list, do `list(bigWigAverageOverBed(...))`) - /// If no name is specified (see the `names` parameter above), then returns a generator of `float`s. - /// If a name column is specified (see above), then returns a generator of tuples `({name}, {average})` + /// Notes + /// ----- + /// If no ``name`` field is specified, returns a generator of floats. + /// If a ``name`` column is specified, returns a generator of tuples + /// ``({name}, {average})``. fn average_over_bed( &mut self, py: Python, @@ -1530,9 +1670,10 @@ impl ZoomIntervalIterator { } } -/// This class is an iterator for `Values` from a bigWig. -/// It returns only values that exist in the bigWig, skipping -/// any missing intervals. +/// An iterator for intervals in a bigWig. +/// +/// It returns only values that exist in the bigWig, skipping any missing +/// intervals. #[pyclass(module = "pybigtools")] struct BigWigIntervalIterator { iter: Box> + Send>, @@ -1553,7 +1694,7 @@ impl BigWigIntervalIterator { } } -/// This class is an interator for the entries in a bigBed file. +/// An iterator for the entries in a bigBed. #[pyclass(module = "pybigtools")] struct BigBedEntriesIterator { iter: Box> + Send>, @@ -1581,7 +1722,7 @@ impl BigBedEntriesIterator { } } -/// This class is the interface for writing a bigWig. +/// Interface for writing to a BigWig file. #[pyclass(module = "pybigtools")] struct BigWigWrite { bigwig: Option, @@ -1589,11 +1730,24 @@ struct BigWigWrite { #[pymethods] impl BigWigWrite { - /// Writes the values passsed to the bigwig file. - /// The underlying file will be closed automatically when the function completes (and no other operations will be able to be performed). + /// Write values to the BigWig file. + /// + /// The underlying file will be closed automatically when the function + /// completes (and no other operations will be able to be performed). /// - /// The chroms argument should be a dictionary with keys as chromosome names and values as their length. 
- /// The vals argument should be an iterable with values (String, int, int, float) that represents each value to write in the format (chromosome, start, end, value). + /// Parameters + /// ---------- + /// chroms : Dict[str, int] + /// A dictionary with keys as chromosome names and values as their + /// length. + /// vals : Iterable[tuple[str, int, int, float]] + /// An iterable with values that represents each value to write in the + /// format (chromosome, start, end, value). + /// + /// Notes + /// ----- + /// The underlying file will be closed automatically when the function + /// completes, and no other operations will be able to be performed. fn write(&mut self, py: Python, chroms: &PyDict, vals: Py) -> PyResult<()> { let runtime = runtime::Builder::new_multi_thread() .worker_threads( @@ -1709,18 +1863,17 @@ impl BigWigWrite { }) } - /// close() - /// -- + /// Close the file. /// - /// Manually closed the file. - /// No other operations will be allowed after it is closed. This is done automatically after write is performed. + /// No other operations will be allowed after it is closed. This is done + /// automatically after write is performed. fn close(&mut self) -> PyResult<()> { self.bigwig.take(); Ok(()) } } -/// This class is the interface for writing to a bigBed. +/// Interface for writing to a BigBed file. #[pyclass(module = "pybigtools")] struct BigBedWrite { bigbed: Option, @@ -1728,10 +1881,25 @@ struct BigBedWrite { #[pymethods] impl BigBedWrite { - /// Writes the values passsed to the bigwig file. The underlying file will be closed automatically when the function completes (and no other operations will be able to be performed). - /// The chroms argument should be a dictionary with keys as chromosome names and values as their length. 
- /// The vals argument should be an iterable with values (String, int, int, String) that represents each value to write in the format (chromosome, start, end, rest) - /// The rest String should consist of tab-delimited fields. + /// Write values to the BigBed file. + /// + /// The underlying file will be closed automatically when the function + /// completes (and no other operations will be able to be performed). + /// + /// Parameters + /// ---------- + /// chroms : Dict[str, int] + /// A dictionary with keys as chromosome names and values as their + /// length. + /// vals : Iterable[tuple[str, int, int, str]] + /// An iterable with values that represents each value to write in the + /// format (chromosome, start, end, rest). The ``rest`` string should + /// consist of tab-delimited fields. + /// + /// Notes + /// ----- + /// The underlying file will be closed automatically when the function + /// completes, and no other operations will be able to be performed. fn write(&mut self, py: Python, chroms: &PyDict, vals: Py) -> PyResult<()> { let runtime = runtime::Builder::new_multi_thread() .worker_threads( @@ -1850,7 +2018,10 @@ impl BigBedWrite { }) } - /// Manually closed the file. No other operations will be allowed after it is closed. This is done automatically after write is performed. + /// Close the file. + /// + /// No other operations will be allowed after it is closed. This is done + /// automatically after write is performed. fn close(&mut self) -> PyResult<()> { self.bigbed.take(); Ok(()) @@ -1908,26 +2079,29 @@ impl BigWigAverageOverBedEntriesIterator { } } -/// This is the entrypoint for working with bigWigs or bigBeds. +/// Open a BigWig or BigBed file for reading or writing. 
/// -/// The first argument can take one of three values: -/// 1) A path to a file as a string -/// 2) An http url for a remote file as a string -/// 3) A file-like object with a `read` and `seek` method +/// Parameters +/// ---------- +/// path_url_or_file_like : str or file-like object +/// The path to a file or an http url for a remote file as a string, or +/// a Python file-like object with ``read`` and ``seek`` methods. +/// mode : Literal["r", "w"], optional [default: "r"] +/// The mode to open the file in. If not provided, it will default to read. +/// "r" will open a bigWig/bigBed for reading but will not allow writing. +/// "w" will open a bigWig/bigBed for writing but will not allow reading. /// -/// When writing, only a file path can be used. +/// Returns +/// ------- +/// BigWigWrite or BigBedWrite or BBIRead +/// The object for reading or writing the BigWig or BigBed file. /// -/// The mode is either "w", "r", or None. "r" or None will open a -/// bigWig or bigBed for reading (but will not allow writing). -/// "w" Will open a bigWig/bigBed for writing (but will not allow reading). +/// Notes +/// ----- +/// For writing, only a file path is currently accepted. /// -/// If passing a file-like object, simultaneous reading of different intervals +/// If passing a file-like object, concurrent reading of different intervals /// is not supported and may result in incorrect behavior. -/// -/// This returns one of the following: -/// - `BigWigWrite` -/// - `BigBedWrite` -/// - `BBIRead` #[pyfunction] fn open(py: Python, path_url_or_file_like: PyObject, mode: Option) -> PyResult { let iswrite = match &mode { @@ -2105,22 +2279,22 @@ fn open_path_or_url( Ok(res) } -/// The base module for opening a bigWig or bigBed. The only defined function is `open`. +/// Read and write Big Binary Indexed (BBI) file types: BigWig and BigBed. 
#[pymodule] fn pybigtools(_py: Python, m: &PyModule) -> PyResult<()> { m.add("__version__", env!("CARGO_PKG_VERSION"))?; - m.add("BBIFileClosed", m.py().get_type::())?; - m.add("BBIReadError", m.py().get_type::())?; m.add_wrapped(wrap_pyfunction!(open))?; + m.add_class::()?; m.add_class::()?; m.add_class::()?; - m.add_class::()?; - m.add_class::()?; m.add_class::()?; + m.add("BBIFileClosed", m.py().get_type::())?; + m.add("BBIReadError", m.py().get_type::())?; + Ok(()) } From 7ee1ab7c48bd926ab0687f66881f505a4e8f5f3f Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Thu, 11 Apr 2024 19:45:55 -0400 Subject: [PATCH 18/30] feat: Make chrom not found raise a KeyError instead of returning None --- pybigtools/src/lib.rs | 42 ++++++++++++++++++++++-------------- pybigtools/tests/test_api.py | 6 +++--- 2 files changed, 29 insertions(+), 19 deletions(-) diff --git a/pybigtools/src/lib.rs b/pybigtools/src/lib.rs index aff15eb..4250886 100644 --- a/pybigtools/src/lib.rs +++ b/pybigtools/src/lib.rs @@ -25,7 +25,7 @@ use bigtools::utils::reopen::Reopen; use file_like::PyFileLikeObject; use numpy::ndarray::ArrayViewMut; use numpy::PyArray1; -use pyo3::exceptions::{self, PyTypeError}; +use pyo3::exceptions::{self, PyKeyError, PyTypeError}; use pyo3::types::{IntoPyDict, PyAny, PyDict, PyFloat, PyInt, PyIterator, PyString, PyTuple}; use pyo3::{create_exception, wrap_pyfunction}; use pyo3::{prelude::*, PyTraverseError, PyVisit}; @@ -1474,30 +1474,40 @@ impl BBIRead { /// ------- /// int or Dict[str, int] or None: /// Chromosome length or a dictionary of chromosome lengths. 
- fn chroms(&mut self, py: Python, chrom: Option) -> PyResult> { + fn chroms(&mut self, py: Python, chrom: Option) -> PyResult { fn get_chrom_obj( b: &B, py: Python, chrom: Option, - ) -> Option { + ) -> PyResult { match chrom { - Some(chrom) => b - .chroms() - .into_iter() - .find(|c| c.name == chrom) - .map(|c| c.length) - .map(|c| c.to_object(py)), - None => Some( - b.chroms() + Some(chrom) => { + let chrom_length = b + .chroms() + .into_iter() + .find(|c| c.name == chrom) + .ok_or_else(|| { + PyErr::new::( + "No chromosome found with the specified name", + ) + }) + .map(|c| c.length.to_object(py))?; + Ok(chrom_length) + } + None => { + let chrom_dict: PyObject = b + .chroms() .into_iter() .map(|c| (c.name.clone(), c.length)) .into_py_dict(py) - .into(), - ), + .into(); + Ok(chrom_dict) + } } } - Ok(match &self.bbi { - BBIReadRaw::Closed => return Err(BBIFileClosed::new_err("File is closed.")), + + match &self.bbi { + BBIReadRaw::Closed => Err(BBIFileClosed::new_err("File is closed.")), BBIReadRaw::BigWigFile(b) => get_chrom_obj(b, py, chrom), #[cfg(feature = "remote")] BBIReadRaw::BigWigRemote(b) => get_chrom_obj(b, py, chrom), @@ -1506,7 +1516,7 @@ impl BBIRead { #[cfg(feature = "remote")] BBIReadRaw::BigBedRemote(b) => get_chrom_obj(b, py, chrom), BBIReadRaw::BigBedFileLike(b) => get_chrom_obj(b, py, chrom), - }) + } } /// Gets the average values from a bigWig over the entries of a bed file. 
diff --git a/pybigtools/tests/test_api.py b/pybigtools/tests/test_api.py index 4d3f29b..bd0bb88 100644 --- a/pybigtools/tests/test_api.py +++ b/pybigtools/tests/test_api.py @@ -74,9 +74,9 @@ def test_chroms(bw, bb): assert bw.chroms("chr17") == 83257441 assert bb.chroms("chr21") == 48129895 - # Missing chrom => None - assert bw.chroms("chr11") is None - assert bb.chroms("chr11") is None + # Missing chrom => KeyError + pytest.raises(KeyError, bw.chroms, "chr11") + pytest.raises(KeyError, bb.chroms, "chr11") def test_zooms(bw, bb): From 3f52238915859d117fb3c386b3e99a43486b577e Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Thu, 11 Apr 2024 20:57:35 -0400 Subject: [PATCH 19/30] feat: Support pathlib.Path as input to open --- pybigtools/src/lib.rs | 7 +++++++ pybigtools/tests/test_api.py | 6 ++++++ 2 files changed, 13 insertions(+) diff --git a/pybigtools/src/lib.rs b/pybigtools/src/lib.rs index 4250886..45ffaef 100644 --- a/pybigtools/src/lib.rs +++ b/pybigtools/src/lib.rs @@ -2131,6 +2131,13 @@ fn open(py: Python, path_url_or_file_like: PyObject, mode: Option) -> Py return open_path_or_url(py, string_ref.to_str().unwrap().to_owned(), iswrite); } + // If pathlib.Path, convert to string and try to open + let path_class = py.import("pathlib")?.getattr("Path")?; + if path_url_or_file_like.as_ref(py).is_instance(path_class)? 
{ + let path_str = path_url_or_file_like.as_ref(py).str()?.to_str()?; + return open_path_or_url(py, path_str.to_owned(), iswrite); + } + if iswrite { return Err(PyErr::new::(format!( "Writing only supports file names", diff --git a/pybigtools/tests/test_api.py b/pybigtools/tests/test_api.py index bd0bb88..76184d1 100644 --- a/pybigtools/tests/test_api.py +++ b/pybigtools/tests/test_api.py @@ -17,6 +17,12 @@ def test_open_close(): b.close() assert pytest.raises(pybigtools.BBIFileClosed, b.chroms) + # Works with pathlib.Path + path = REPO_ROOT / "bigtools/resources/test/valid.bigWig" + b = pybigtools.open(path, "r") + b.close() + assert pytest.raises(pybigtools.BBIFileClosed, b.chroms) + # Files are closed when exiting a context manager with pybigtools.open(path, "r") as b: pass From 29684d67ce7db7b6d3dda192cecafd806a705286 Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Fri, 12 Apr 2024 04:47:22 -0400 Subject: [PATCH 20/30] test: Add url and filelikes to tests ci: Install pybigtools[test] for testing --- .github/workflows/ci.yml | 4 ++-- pybigtools/pyproject.toml | 24 ++++++++++++++++++++ pybigtools/tests/test_api.py | 44 +++++++++++++++++++++++++++++------- 3 files changed, 62 insertions(+), 10 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 26baa88..7b25fa1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -56,9 +56,9 @@ jobs: - name: Install pybigtools run: | - pip install maturin pytest + pip install maturin cd pybigtools - pip install -e . 
+ pip install -e .[test] - name: Install run: | diff --git a/pybigtools/pyproject.toml b/pybigtools/pyproject.toml index 562e4be..cd8225b 100644 --- a/pybigtools/pyproject.toml +++ b/pybigtools/pyproject.toml @@ -6,10 +6,34 @@ build-backend = "maturin" name = "pybigtools" description = "Python bindings to the Bigtools Rust library for high-performance BigWig and BigBed I/O" license = { text = "MIT" } +keywords = [ + "bigwig", + "bigbed", + "bbi", + "bioinformatics", + "genomics", + "kent", + "ucsc", + "rust", +] +classifiers = [ + "Development Status :: 4 - Beta", + "Operating System :: OS Independent", + "Topic :: Scientific/Engineering :: Bio-Informatics", + "Programming Language :: Rust", + "Programming Language :: Python", + "Programming Language :: Python :: 3", +] dependencies = [ "numpy" ] +[project.optional-dependencies] +test = [ + "pytest", + "smart_open[http]" +] + [project.urls] homepage = "https://github.com/jackh726/bigtools" documentation = "https://bigtools.readthedocs.io" diff --git a/pybigtools/tests/test_api.py b/pybigtools/tests/test_api.py index 76184d1..28d1715 100644 --- a/pybigtools/tests/test_api.py +++ b/pybigtools/tests/test_api.py @@ -4,6 +4,7 @@ import numpy as np import pytest +import smart_open import pybigtools @@ -17,12 +18,6 @@ def test_open_close(): b.close() assert pytest.raises(pybigtools.BBIFileClosed, b.chroms) - # Works with pathlib.Path - path = REPO_ROOT / "bigtools/resources/test/valid.bigWig" - b = pybigtools.open(path, "r") - b.close() - assert pytest.raises(pybigtools.BBIFileClosed, b.chroms) - # Files are closed when exiting a context manager with pybigtools.open(path, "r") as b: pass @@ -33,6 +28,39 @@ def test_open_close(): assert pytest.raises(pybigtools.BBIReadError, pybigtools.open, s, "r") +def test_open_pathlib_path(): + path = REPO_ROOT / "bigtools/resources/test/valid.bigWig" + with pybigtools.open(path, "r") as b: + assert b.chroms() == {"chr17": 83_257_441} + + +def test_open_raw_url(): + url = 
"http://genome.ucsc.edu/goldenPath/help/examples/bigLollyExample2.bb" + with pybigtools.open(url, "r") as b: + assert b.chroms() == {'chr21': 46_709_983} + + +def test_open_filelike(): + # Regular file + with open(REPO_ROOT / "bigtools/resources/test/valid.bigWig", "rb") as f: + with pybigtools.open(f, "r") as b: + assert b.chroms() == {"chr17": 83_257_441} + + # BytesIO + with open(REPO_ROOT / "bigtools/resources/test/valid.bigWig", "rb") as f: + bw_bytes = f.read() + + with BytesIO(bw_bytes) as f: + with pybigtools.open(f, "r") as b: + assert b.chroms() == {"chr17": 83_257_441} + + # Other file-like objects + url = "http://genome.ucsc.edu/goldenPath/help/examples/bigWigExample.bw" + with smart_open.open(url, "rb") as f: + with pybigtools.open(f, "r") as b: + assert b.chroms() == {'chr21': 48_129_895} + + @pytest.fixture def bw(): path = str(REPO_ROOT / "bigtools/resources/test/valid.bigWig") @@ -77,8 +105,8 @@ def test_chroms(bw, bb): assert bb.chroms() == {"chr21": 48_129_895} # Arg with chrom name => length - assert bw.chroms("chr17") == 83257441 - assert bb.chroms("chr21") == 48129895 + assert bw.chroms("chr17") == 83_257_441 + assert bb.chroms("chr21") == 48_129_895 # Missing chrom => KeyError pytest.raises(KeyError, bw.chroms, "chr11") From f46571ae9375df26501a00c9aff0595b613fb3f8 Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Sat, 13 Apr 2024 10:57:50 -0400 Subject: [PATCH 21/30] Add doc string to custom exceptions --- pybigtools/src/lib.rs | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/pybigtools/src/lib.rs b/pybigtools/src/lib.rs index 45ffaef..93dcf17 100644 --- a/pybigtools/src/lib.rs +++ b/pybigtools/src/lib.rs @@ -37,8 +37,18 @@ mod file_like; type ValueTuple = (u32, u32, f32); -create_exception!(pybigtools, BBIFileClosed, exceptions::PyException); -create_exception!(pybigtools, BBIReadError, exceptions::PyException); +create_exception!( + pybigtools, + BBIFileClosed, + exceptions::PyException, + "BBI 
File is closed." +); +create_exception!( + pybigtools, + BBIReadError, + exceptions::PyException, + "Error reading BBI file." +); fn start_end( bbi: &BBIReadRaw, @@ -209,7 +219,7 @@ fn intervals_to_array( Some(bins) => { if v.len() != bins { return Err(PyErr::new::(format!( - "`arr` does not the expected size (expected `{}`, found `{}`), if passed.", + "`arr` does not have the expected size (expected `{}`, found `{}`), if passed.", bins, v.len(), ))); @@ -303,6 +313,7 @@ fn intervals_to_array( Ok(arr) } + fn entries_to_array( py: Python<'_>, b: &mut BigBedReadRaw, @@ -458,6 +469,7 @@ fn to_array>>( } Ok(()) } + fn to_array_bins>>( start: i32, end: i32, @@ -564,6 +576,7 @@ fn to_array_bins>>( } Ok(()) } + fn to_array_zoom>>( start: i32, end: i32, @@ -681,6 +694,7 @@ fn to_array_zoom>>( } Ok(()) } + fn to_entry_array>>( start: i32, end: i32, @@ -700,6 +714,7 @@ fn to_entry_array>>( } Ok(()) } + fn to_entry_array_bins>>( start: i32, end: i32, From d04cc4cb1cd5592e89d1dbdc809c8a082586adf5 Mon Sep 17 00:00:00 2001 From: Nezar Abdennur Date: Sat, 13 Apr 2024 10:59:52 -0400 Subject: [PATCH 22/30] fix: Make chrom and end optional for values() --- pybigtools/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pybigtools/src/lib.rs b/pybigtools/src/lib.rs index 93dcf17..15e1bdc 100644 --- a/pybigtools/src/lib.rs +++ b/pybigtools/src/lib.rs @@ -1425,7 +1425,7 @@ impl BBIRead { /// records : Get the records of a given range on a chromosome. /// zoom_records : Get the zoom records of a given range on a chromosome. 
#[pyo3( - signature = (chrom, start, end, bins=None, summary="mean".to_string(), exact=false, missing=0.0, oob=f64::NAN, arr=None), + signature = (chrom, start=None, end=None, bins=None, summary="mean".to_string(), exact=false, missing=0.0, oob=f64::NAN, arr=None), text_signature = r#"(chrom, start, end, bins=None, summary="mean", exact=False, missing=0.0, oob=..., arr=None)"#, )] fn values( From 64dcb9f92487d1ed0f798c8ffecff5eb34bf26cc Mon Sep 17 00:00:00 2001 From: Jack Huey <31162821+jackh726@users.noreply.github.com> Date: Mon, 8 Apr 2024 16:50:24 -0400 Subject: [PATCH 23/30] Fix test --- pybigtools/src/lib.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pybigtools/src/lib.rs b/pybigtools/src/lib.rs index 15e1bdc..856e008 100644 --- a/pybigtools/src/lib.rs +++ b/pybigtools/src/lib.rs @@ -1117,8 +1117,7 @@ impl BBIRead { uint chromStart; "Start position in chromosome" uint chromEnd; "End position in chromosome" float value; "Value for a given interval" -)\ - "#; +)"#; let schema = match &mut self.bbi { BBIReadRaw::Closed => return Err(BBIFileClosed::new_err("File is closed.")), BBIReadRaw::BigWigFile(_) | BBIReadRaw::BigWigFileLike(_) => BEDGRAPH.to_string(), From bff95bd4de0c02d00c07e4f006aab743f1c6d420 Mon Sep 17 00:00:00 2001 From: Jack Huey <31162821+jackh726@users.noreply.github.com> Date: Mon, 13 May 2024 10:00:27 -0400 Subject: [PATCH 24/30] Use nanmean instead of mean --- pybigtools/src/lib.rs | 104 ++++++++++++++++++++--------------- pybigtools/tests/test_api.py | 8 +-- 2 files changed, 65 insertions(+), 47 deletions(-) diff --git a/pybigtools/src/lib.rs b/pybigtools/src/lib.rs index 856e008..4695de2 100644 --- a/pybigtools/src/lib.rs +++ b/pybigtools/src/lib.rs @@ -482,7 +482,7 @@ fn to_array_bins>>( assert_eq!(v.len(), bins); v.fill(missing); - let mut bin_data: VecDeque<(usize, i32, i32, Option)> = VecDeque::new(); + let mut bin_data: VecDeque<(usize, i32, i32, Option<(i32, f64)>)> = VecDeque::new(); let bin_size = (end - 
start) as f64 / bins as f64; for interval in iter { let interval = interval?; @@ -498,13 +498,13 @@ fn to_array_bins>>( match summary { Summary::Min => { - v[bin] = front.3.unwrap_or(missing); + v[bin] = front.3.map(|v| v.1).unwrap_or(missing); } Summary::Max => { - v[bin] = front.3.unwrap_or(missing); + v[bin] = front.3.map(|v| v.1).unwrap_or(missing); } Summary::Mean => { - v[bin] = front.3.map(|v| v / bin_size).unwrap_or(missing); + v[bin] = front.3.map(|(c, v)| v / (c as f64)).unwrap_or(missing); } } } else { @@ -534,13 +534,13 @@ fn to_array_bins>>( assert!(bin_data.iter().find(|b| b.0 == bin).is_some()); } for (_bin, bin_start, bin_end, data) in bin_data.iter_mut() { - let v = data.get_or_insert_with(|| { + let (c, v) = data.get_or_insert_with(|| { match summary { // min & max are defined for NAN and we are about to set it // can't use 0.0 because it may be either below or above the real value - Summary::Min | Summary::Max => f64::NAN, + Summary::Min | Summary::Max => (0, f64::NAN), // addition is not defined for NAN - Summary::Mean => 0.0, + Summary::Mean => (0, 0.0), } }); match summary { @@ -555,6 +555,7 @@ fn to_array_bins>>( let overlap_end = (*bin_end).min(interval_end); let overlap_size: i32 = overlap_end - overlap_start; *v += (overlap_size as f64) * interval.value as f64; + *c += overlap_size; } } } @@ -564,13 +565,13 @@ fn to_array_bins>>( match summary { Summary::Min => { - v[bin] = front.3.unwrap_or(missing); + v[bin] = front.3.map(|v| v.1).unwrap_or(missing); } Summary::Max => { - v[bin] = front.3.unwrap_or(missing); + v[bin] = front.3.map(|v| v.1).unwrap_or(missing); } Summary::Mean => { - v[bin] = front.3.map(|v| v / bin_size).unwrap_or(missing); + v[bin] = front.3.map(|(c, v)| v / (c as f64)).unwrap_or(missing); } } } @@ -589,7 +590,8 @@ fn to_array_zoom>>( assert_eq!(v.len(), bins); v.fill(missing); - let mut bin_data: VecDeque<(usize, i32, i32, Option)> = VecDeque::new(); + // (bin, bin_start, bin_end, Option<(covered_bases, value)>) + let 
mut bin_data: VecDeque<(usize, i32, i32, Option<(i32, f64)>)> = VecDeque::new(); let bin_size = (end - start) as f64 / bins as f64; for interval in iter { let interval = interval?; @@ -605,13 +607,13 @@ fn to_array_zoom>>( match summary { Summary::Min => { - v[bin] = front.3.unwrap_or(missing); + v[bin] = front.3.map(|v| v.1).unwrap_or(missing); } Summary::Max => { - v[bin] = front.3.unwrap_or(missing); + v[bin] = front.3.map(|v| v.1).unwrap_or(missing); } Summary::Mean => { - v[bin] = front.3.map(|v| v / bin_size).unwrap_or(missing); + v[bin] = front.3.map(|(c, v)| v / (c as f64)).unwrap_or(missing); } } } else { @@ -641,37 +643,37 @@ fn to_array_zoom>>( assert!(bin_data.iter().find(|b| b.0 == bin).is_some()); } for (_bin, bin_start, bin_end, data) in bin_data.iter_mut() { + let overlap_start = (*bin_start).max(interval_start); + let overlap_end = (*bin_end).min(interval_end); + let overlap_size: i32 = overlap_end - overlap_start; match data.as_mut() { - Some(v) => match summary { + Some((c, v)) => match summary { Summary::Min => { *v = v.min(interval.summary.min_val as f64); + *c += overlap_size; } Summary::Max => { *v = v.max(interval.summary.max_val as f64); + *c += overlap_size; } Summary::Mean => { - let overlap_start = (*bin_start).max(interval_start); - let overlap_end = (*bin_end).min(interval_end); - let overlap_size = overlap_end - overlap_start; let zoom_mean = (interval.summary.sum as f64) / (interval.summary.bases_covered as f64); *v += (overlap_size as f64) * zoom_mean; + *c += overlap_size; } }, None => match summary { Summary::Min => { - *data = Some(interval.summary.min_val as f64); + *data = Some((overlap_size, interval.summary.min_val as f64)); } Summary::Max => { - *data = Some(interval.summary.max_val as f64); + *data = Some((overlap_size, interval.summary.max_val as f64)); } Summary::Mean => { - let overlap_start = (*bin_start).max(interval_start); - let overlap_end = (*bin_end).min(interval_end); - let overlap_size = overlap_end - 
overlap_start; let zoom_mean = (interval.summary.sum as f64) / (interval.summary.bases_covered as f64); - *data = Some((overlap_size as f64) * zoom_mean); + *data = Some((overlap_size, (overlap_size as f64) * zoom_mean)); } }, } @@ -682,13 +684,13 @@ fn to_array_zoom>>( match summary { Summary::Min => { - v[bin] = front.3.unwrap_or(missing); + v[bin] = front.3.map(|v| v.1).unwrap_or(missing); } Summary::Max => { - v[bin] = front.3.unwrap_or(missing); + v[bin] = front.3.map(|v| v.1).unwrap_or(missing); } Summary::Mean => { - v[bin] = front.3.map(|v| v / bin_size).unwrap_or(missing); + v[bin] = front.3.map(|(c, v)| v / (c as f64)).unwrap_or(missing); } } } @@ -727,7 +729,7 @@ fn to_entry_array_bins>>( assert_eq!(v.len(), bins); v.fill(missing); - let mut bin_data: VecDeque<(usize, i32, i32, Vec)> = VecDeque::new(); + let mut bin_data: VecDeque<(usize, i32, i32, i32, Vec)> = VecDeque::new(); let bin_size = (end - start) as f64 / bins as f64; for interval in iter { let interval = interval?; @@ -744,20 +746,23 @@ fn to_entry_array_bins>>( match summary { Summary::Min => { v[bin] = front - .3 + .4 .into_iter() .reduce(|min, v| min.min(v)) .unwrap_or(missing); } Summary::Max => { v[bin] = front - .3 + .4 .into_iter() .reduce(|max, v| max.max(v)) .unwrap_or(missing); } Summary::Mean => { - v[bin] = front.3.into_iter().sum::() / bin_size; + v[bin] = (front.3 != 0) + .then(|| front.3 as f64) + .map(|c| front.4.into_iter().sum::() / c) + .unwrap_or(missing); } } } else { @@ -776,6 +781,7 @@ fn to_entry_array_bins>>( bin, bin_start, bin_end, + 0, vec![missing; (bin_end - bin_start) as usize], )); } @@ -791,7 +797,7 @@ fn to_entry_array_bins>>( for bin in bin_start..bin_end { assert!(bin_data.iter().find(|b| b.0 == bin).is_some()); } - for (_bin, bin_start, bin_end, data) in bin_data.iter_mut() { + for (_bin, bin_start, bin_end, covered, data) in bin_data.iter_mut() { let overlap_start = (*bin_start).max(interval_start); let overlap_end = (*bin_end).min(interval_end); @@ -801,6 
+807,7 @@ fn to_entry_array_bins>>( // If NAN, then 0.0 + 1.0, else i + 1.0 *i = (*i).max(0.0) + 1.0; } + *covered += overlap_end - overlap_start; } } while let Some(front) = bin_data.pop_front() { @@ -809,20 +816,23 @@ fn to_entry_array_bins>>( match summary { Summary::Min => { v[bin] = front - .3 + .4 .into_iter() .reduce(|min, v| min.min(v)) .unwrap_or(missing); } Summary::Max => { v[bin] = front - .3 + .4 .into_iter() .reduce(|max, v| max.max(v)) .unwrap_or(missing); } Summary::Mean => { - v[bin] = front.3.into_iter().sum::() / bin_size; + v[bin] = (front.3 != 0) + .then(|| front.3 as f64) + .map(|c| front.4.into_iter().sum::() / c) + .unwrap_or(missing); } } } @@ -841,7 +851,7 @@ fn to_entry_array_zoom>>( assert_eq!(v.len(), bins); v.fill(missing); - let mut bin_data: VecDeque<(usize, i32, i32, Vec)> = VecDeque::new(); + let mut bin_data: VecDeque<(usize, i32, i32, i32, Vec)> = VecDeque::new(); let bin_size = (end - start) as f64 / bins as f64; for interval in iter { let interval = interval?; @@ -858,20 +868,23 @@ fn to_entry_array_zoom>>( match summary { Summary::Min => { v[bin] = front - .3 + .4 .into_iter() .reduce(|min, v| min.min(v)) .unwrap_or(missing); } Summary::Max => { v[bin] = front - .3 + .4 .into_iter() .reduce(|max, v| max.max(v)) .unwrap_or(missing); } Summary::Mean => { - v[bin] = front.3.into_iter().sum::() / bin_size; + v[bin] = (front.3 != 0) + .then(|| front.3 as f64) + .map(|c| front.4.into_iter().sum::() / c) + .unwrap_or(missing); } } } else { @@ -890,6 +903,7 @@ fn to_entry_array_zoom>>( bin, bin_start, bin_end, + 0, vec![missing; (bin_end - bin_start) as usize], )); } @@ -905,7 +919,7 @@ fn to_entry_array_zoom>>( for bin in bin_start..bin_end { assert!(bin_data.iter().find(|b| b.0 == bin).is_some()); } - for (_bin, bin_start, bin_end, data) in bin_data.iter_mut() { + for (_bin, bin_start, bin_end, c, data) in bin_data.iter_mut() { let overlap_start = (*bin_start).max(interval_start); let overlap_end = (*bin_end).min(interval_end); @@ 
-920,6 +934,7 @@ fn to_entry_array_zoom>>( Summary::Max => *i = i.max(interval.summary.max_val), } } + *c += overlap_end - overlap_start; } } while let Some(front) = bin_data.pop_front() { @@ -928,20 +943,23 @@ fn to_entry_array_zoom>>( match summary { Summary::Min => { v[bin] = front - .3 + .4 .into_iter() .reduce(|min, v| min.min(v)) .unwrap_or(missing); } Summary::Max => { v[bin] = front - .3 + .4 .into_iter() .reduce(|max, v| max.max(v)) .unwrap_or(missing); } Summary::Mean => { - v[bin] = front.3.into_iter().sum::() / bin_size; + v[bin] = (front.3 != 0) + .then(|| front.3 as f64) + .map(|c| front.4.into_iter().sum::() / c) + .unwrap_or(missing); } } } diff --git a/pybigtools/tests/test_api.py b/pybigtools/tests/test_api.py index 28d1715..07347dc 100644 --- a/pybigtools/tests/test_api.py +++ b/pybigtools/tests/test_api.py @@ -246,8 +246,8 @@ def test_values(bw, bb): assert len(bw.values("chr17", 100000, 110000, 10)) == 10 assert len(bb.values("chr21", 10_148_000, 10_158_000, 10)) == 10 - assert bw.values("chr17", 100000, 110000, 10)[0] == 0.37435242314338685 - assert bb.values("chr21", 10_148_000, 10_158_000, 10)[0] == 0.175 + assert bw.values("chr17", 100000, 110000, 10)[0] == 0.44886381671868925 + assert bb.values("chr21", 10_148_000, 10_158_000, 10)[0] == 1.0 assert bw.values("chr17", 100000, 110000, 10, "max")[0] == 1.1978399753570557 assert bb.values("chr21", 10_148_000, 10_158_000, 10, "max")[0] == 1.0 @@ -257,11 +257,11 @@ def test_values(bw, bb): assert ( bw.values("chr17", 100000, 110000, 10, "mean", exact=True)[0] - == 0.37885534041374924 + == 0.4542629980980206 ) assert ( bb.values("chr21", 10_148_000, 10_158_000, 10, "mean", exact=True)[0] - == 0.175 + == 1.0 ) assert list(bw.values("chr17", 59890, 59900, 10, "mean", exact=True)) == [ From 7c2df7967a1113525685e2efd947fb261d14afb6 Mon Sep 17 00:00:00 2001 From: Jack Huey <31162821+jackh726@users.noreply.github.com> Date: Sun, 23 Jun 2024 17:11:55 -0400 Subject: [PATCH 25/30] bigBed autosql can be 
null, so return Option --- bigtools/src/bbi/bigbedread.rs | 7 +++++-- bigtools/src/utils/cli/bigbedinfo.rs | 16 +++++++++++----- bigtools/tests/bigbedwrite.rs | 2 +- pybigtools/src/lib.rs | 6 +++--- pybigtools/tests/test_api.py | 6 +++--- 5 files changed, 23 insertions(+), 14 deletions(-) diff --git a/bigtools/src/bbi/bigbedread.rs b/bigtools/src/bbi/bigbedread.rs index bf8fca4..b26b24a 100644 --- a/bigtools/src/bbi/bigbedread.rs +++ b/bigtools/src/bbi/bigbedread.rs @@ -190,8 +190,11 @@ impl BigBedRead { } /// Reads the autosql from this bigBed - pub fn autosql(&mut self) -> Result { + pub fn autosql(&mut self) -> Result, BBIReadError> { let auto_sql_offset = self.info.header.auto_sql_offset; + if auto_sql_offset == 0 { + return Ok(None); + } let reader = self.reader().raw_reader(); let mut reader = BufReader::new(reader); reader.seek(SeekFrom::Start(auto_sql_offset))?; @@ -200,7 +203,7 @@ impl BigBedRead { buffer.pop(); let autosql = String::from_utf8(buffer) .map_err(|_| BBIReadError::InvalidFile("Invalid autosql: not UTF-8".to_owned()))?; - Ok(autosql) + Ok(Some(autosql)) } pub fn item_count(&mut self) -> Result { diff --git a/bigtools/src/utils/cli/bigbedinfo.rs b/bigtools/src/utils/cli/bigbedinfo.rs index f43d018..46431a2 100644 --- a/bigtools/src/utils/cli/bigbedinfo.rs +++ b/bigtools/src/utils/cli/bigbedinfo.rs @@ -83,11 +83,17 @@ pub fn bigbedinfo(args: BigBedInfoArgs) -> Result<(), Box> { } if args.autosql { let autosql = bigbed.autosql()?; - if autosql.len() == 0 { - println!("as: n/a"); - } else { - println!("as:"); - print!("{}", autosql); + match autosql { + None => { + println!("as: n/a"); + } + Some(autosql) if autosql.is_empty() => { + println!("as: n/a"); + } + Some(autosql) => { + println!("as:"); + print!("{}", autosql); + } } } let summary = bigbed.get_summary()?; diff --git a/bigtools/tests/bigbedwrite.rs b/bigtools/tests/bigbedwrite.rs index bc55c42..2ef1216 100644 --- a/bigtools/tests/bigbedwrite.rs +++ b/bigtools/tests/bigbedwrite.rs @@ 
-60,7 +60,7 @@ fn bigbedwrite_test() -> Result<(), Box> { assert_eq!(chroms[0].length, 83257441); assert_eq!( - &bwread.autosql().unwrap(), + &bwread.autosql().unwrap().unwrap(), "table bed\n\"Browser Extensible Data\"\n(\n string chrom; \"Reference sequence chromosome or scaffold\"\n uint chromStart; \"Start position in chromosome\"\n uint chromEnd; \"End position in chromosome\"\n string name; \"Name of item.\"\n uint score; \"Score (0-1000)\"\n)", ); diff --git a/pybigtools/src/lib.rs b/pybigtools/src/lib.rs index 4695de2..748b931 100644 --- a/pybigtools/src/lib.rs +++ b/pybigtools/src/lib.rs @@ -1141,10 +1141,10 @@ impl BBIRead { BBIReadRaw::BigWigFile(_) | BBIReadRaw::BigWigFileLike(_) => BEDGRAPH.to_string(), #[cfg(feature = "remote")] BBIReadRaw::BigWigRemote(_) => BEDGRAPH.to_string(), - BBIReadRaw::BigBedFile(b) => b.autosql().convert_err()?, + BBIReadRaw::BigBedFile(b) => b.autosql().convert_err()?.unwrap_or(String::new()), #[cfg(feature = "remote")] - BBIReadRaw::BigBedRemote(b) => b.autosql().convert_err()?, - BBIReadRaw::BigBedFileLike(b) => b.autosql().convert_err()?, + BBIReadRaw::BigBedRemote(b) => b.autosql().convert_err()?.unwrap_or(String::new()), + BBIReadRaw::BigBedFileLike(b) => b.autosql().convert_err()?.unwrap_or(String::new()), }; let obj = if parse { let mut declarations = parse_autosql(&schema) diff --git a/pybigtools/tests/test_api.py b/pybigtools/tests/test_api.py index 07347dc..b3ee065 100644 --- a/pybigtools/tests/test_api.py +++ b/pybigtools/tests/test_api.py @@ -123,10 +123,10 @@ def test_autosql(bw, bb): # Even bigwigs have sql (a sql representing bedGraph) assert "bedGraph" in bw.sql() # We can parse the sql - # assert bw.sql(True)['name'] == 'bedGraph' + assert bw.sql(True)['name'] == 'bedGraph' - # bb.sql() - # BBIReadError: The file was invalid: Invalid autosql: not UTF-8 + # Unfortunately, this test bigBed doesn't actually have autosql + assert len(bb.sql()) == 0 def test_records(bw, bb): From 
c134044b99b8718f5437fbb53b57940d52bb6d8d Mon Sep 17 00:00:00 2001 From: Jack Huey <31162821+jackh726@users.noreply.github.com> Date: Sun, 23 Jun 2024 17:25:41 -0400 Subject: [PATCH 26/30] Bump pybigtools dev minor version --- Cargo.lock | 2 +- pybigtools/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8b0acc2..75c86e3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -743,7 +743,7 @@ dependencies = [ [[package]] name = "pybigtools" -version = "0.1.5-dev" +version = "0.2.0-dev" dependencies = [ "bigtools", "futures", diff --git a/pybigtools/Cargo.toml b/pybigtools/Cargo.toml index 0806602..38164d1 100644 --- a/pybigtools/Cargo.toml +++ b/pybigtools/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "pybigtools" -version = "0.1.5-dev" +version = "0.2.0-dev" authors = ["Jack "] edition = "2021" From 88e429e558f0b1e45d41d27b7239ad884b773370 Mon Sep 17 00:00:00 2001 From: Jack Huey <31162821+jackh726@users.noreply.github.com> Date: Mon, 24 Jun 2024 00:27:14 -0400 Subject: [PATCH 27/30] Track coverage by base --- pybigtools/src/lib.rs | 63 ++++++++++++++++++++++++++++--------------- 1 file changed, 42 insertions(+), 21 deletions(-) diff --git a/pybigtools/src/lib.rs b/pybigtools/src/lib.rs index 748b931..2a25ad8 100644 --- a/pybigtools/src/lib.rs +++ b/pybigtools/src/lib.rs @@ -729,7 +729,9 @@ fn to_entry_array_bins>>( assert_eq!(v.len(), bins); v.fill(missing); - let mut bin_data: VecDeque<(usize, i32, i32, i32, Vec)> = VecDeque::new(); + // (, , , , ) + // covered_bases = 0 if uncovered, 1 if covered + let mut bin_data: VecDeque<(usize, i32, i32, Vec, Vec)> = VecDeque::new(); let bin_size = (end - start) as f64 / bins as f64; for interval in iter { let interval = interval?; @@ -759,8 +761,11 @@ fn to_entry_array_bins>>( .unwrap_or(missing); } Summary::Mean => { - v[bin] = (front.3 != 0) - .then(|| front.3 as f64) + v[bin] = front + .3 + .iter() + .any(|v| *v > 0) + .then(|| front.3.into_iter().sum::() as f64) .map(|c| 
front.4.into_iter().sum::() / c) .unwrap_or(missing); } @@ -781,7 +786,7 @@ fn to_entry_array_bins>>( bin, bin_start, bin_end, - 0, + vec![0; (bin_end - bin_start) as usize], vec![missing; (bin_end - bin_start) as usize], )); } @@ -801,13 +806,15 @@ fn to_entry_array_bins>>( let overlap_start = (*bin_start).max(interval_start); let overlap_end = (*bin_end).min(interval_end); - for i in &mut data - [((overlap_start - *bin_start) as usize)..((overlap_end - *bin_start) as usize)] - { + let range = + ((overlap_start - *bin_start) as usize)..((overlap_end - *bin_start) as usize); + for i in &mut data[range.clone()] { // If NAN, then 0.0 + 1.0, else i + 1.0 *i = (*i).max(0.0) + 1.0; } - *covered += overlap_end - overlap_start; + for i in &mut covered[range] { + *i = (*i).max(1); + } } } while let Some(front) = bin_data.pop_front() { @@ -829,8 +836,11 @@ fn to_entry_array_bins>>( .unwrap_or(missing); } Summary::Mean => { - v[bin] = (front.3 != 0) - .then(|| front.3 as f64) + v[bin] = front + .3 + .iter() + .any(|v| *v > 0) + .then(|| front.3.into_iter().sum::() as f64) .map(|c| front.4.into_iter().sum::() / c) .unwrap_or(missing); } @@ -851,7 +861,9 @@ fn to_entry_array_zoom>>( assert_eq!(v.len(), bins); v.fill(missing); - let mut bin_data: VecDeque<(usize, i32, i32, i32, Vec)> = VecDeque::new(); + // (, , , , ) + // covered_bases = 0 if uncovered, 1 if covered + let mut bin_data: VecDeque<(usize, i32, i32, Vec, Vec)> = VecDeque::new(); let bin_size = (end - start) as f64 / bins as f64; for interval in iter { let interval = interval?; @@ -881,8 +893,11 @@ fn to_entry_array_zoom>>( .unwrap_or(missing); } Summary::Mean => { - v[bin] = (front.3 != 0) - .then(|| front.3 as f64) + v[bin] = front + .3 + .iter() + .any(|v| *v > 0) + .then(|| front.3.into_iter().sum::() as f64) .map(|c| front.4.into_iter().sum::() / c) .unwrap_or(missing); } @@ -903,7 +918,7 @@ fn to_entry_array_zoom>>( bin, bin_start, bin_end, - 0, + vec![0; (bin_end - bin_start) as usize], vec![missing; 
(bin_end - bin_start) as usize], )); } @@ -919,14 +934,14 @@ fn to_entry_array_zoom>>( for bin in bin_start..bin_end { assert!(bin_data.iter().find(|b| b.0 == bin).is_some()); } - for (_bin, bin_start, bin_end, c, data) in bin_data.iter_mut() { + for (_bin, bin_start, bin_end, covered, data) in bin_data.iter_mut() { let overlap_start = (*bin_start).max(interval_start); let overlap_end = (*bin_end).min(interval_end); let mean = interval.summary.sum / (interval.end - interval.start) as f64; - for i in &mut data - [((overlap_start - *bin_start) as usize)..((overlap_end - *bin_start) as usize)] - { + let range = + ((overlap_start - *bin_start) as usize)..((overlap_end - *bin_start) as usize); + for i in &mut data[range.clone()] { *i = i.max(0.0); match summary { Summary::Mean => *i += mean, @@ -934,7 +949,10 @@ fn to_entry_array_zoom>>( Summary::Max => *i = i.max(interval.summary.max_val), } } - *c += overlap_end - overlap_start; + + for i in &mut covered[range] { + *i = (*i).max(1); + } } } while let Some(front) = bin_data.pop_front() { @@ -956,8 +974,11 @@ fn to_entry_array_zoom>>( .unwrap_or(missing); } Summary::Mean => { - v[bin] = (front.3 != 0) - .then(|| front.3 as f64) + v[bin] = front + .3 + .iter() + .any(|v| *v > 0) + .then(|| front.3.into_iter().sum::() as f64) .map(|c| front.4.into_iter().sum::() / c) .unwrap_or(missing); } From f4cae68f35ecd2dd0d21cf7dda5faef86f9627b2 Mon Sep 17 00:00:00 2001 From: Jack Huey <31162821+jackh726@users.noreply.github.com> Date: Wed, 26 Jun 2024 17:51:57 -0400 Subject: [PATCH 28/30] Don't use ceil for bin_start and bin_end --- pybigtools/src/lib.rs | 16 ++++++++-------- pybigtools/tests/test_api.py | 10 ++++++++++ 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/pybigtools/src/lib.rs b/pybigtools/src/lib.rs index 2a25ad8..4ee84f8 100644 --- a/pybigtools/src/lib.rs +++ b/pybigtools/src/lib.rs @@ -516,8 +516,8 @@ fn to_array_bins>>( .map(|b| ((b.0 + 1) < bin_end).then(|| b.0 + 1)) .unwrap_or(Some(bin_start)) { - 
let bin_start = ((bin as f64) * bin_size).ceil() as i32; - let bin_end = (((bin + 1) as f64) * bin_size).ceil() as i32; + let bin_start = ((bin as f64) * bin_size) as i32; + let bin_end = (((bin + 1) as f64) * bin_size) as i32; bin_data.push_back((bin, bin_start, bin_end, None)); } @@ -625,8 +625,8 @@ fn to_array_zoom>>( .map(|b| ((b.0 + 1) < bin_end).then(|| b.0 + 1)) .unwrap_or(Some(bin_start)) { - let bin_start = ((bin as f64) * bin_size).ceil() as i32; - let bin_end = (((bin + 1) as f64) * bin_size).ceil() as i32; + let bin_start = ((bin as f64) * bin_size) as i32; + let bin_end = (((bin + 1) as f64) * bin_size) as i32; bin_data.push_back((bin, bin_start, bin_end, None)); } @@ -779,8 +779,8 @@ fn to_entry_array_bins>>( .map(|b| ((b.0 + 1) < bin_end).then(|| b.0 + 1)) .unwrap_or(Some(bin_start)) { - let bin_start = ((bin as f64) * bin_size).ceil() as i32; - let bin_end = (((bin + 1) as f64) * bin_size).ceil() as i32; + let bin_start = ((bin as f64) * bin_size) as i32; + let bin_end = (((bin + 1) as f64) * bin_size) as i32; bin_data.push_back(( bin, @@ -911,8 +911,8 @@ fn to_entry_array_zoom>>( .map(|b| ((b.0 + 1) < bin_end).then(|| b.0 + 1)) .unwrap_or(Some(bin_start)) { - let bin_start = ((bin as f64) * bin_size).ceil() as i32; - let bin_end = (((bin + 1) as f64) * bin_size).ceil() as i32; + let bin_start = ((bin as f64) * bin_size) as i32; + let bin_end = (((bin + 1) as f64) * bin_size) as i32; bin_data.push_back(( bin, diff --git a/pybigtools/tests/test_api.py b/pybigtools/tests/test_api.py index b3ee065..78e323b 100644 --- a/pybigtools/tests/test_api.py +++ b/pybigtools/tests/test_api.py @@ -323,3 +323,13 @@ def test_values(bw, bb): assert math.isnan(ret_arr[0]) assert ret_arr[19] == 0.0 assert np.array_equal(arr, ret_arr, equal_nan=True) + + # Some differences in estimates between pybigtools to other libs + vals = bw.values("chr17", 85525, 85730, bins=2, exact=False) + assert list(vals) == [0.15392776003070907, 2.728891665264241] + vals = bw.values("chr17", 
85525, 85730, bins=2, exact=True) + assert list(vals) == [0.06770934917680595, 2.4864424403431347] + vals = bw.values("chr17", 59900, 60105, bins=2, exact=False) + assert list(vals) == [0.5358060553962108, 0.5513471488813751] + vals = bw.values("chr17", 59900, 60105, bins=2, exact=True) + assert list(vals) == [0.5362001863472602, 0.5527710799959679] From ad7c1b75156453244fb915deefdfc6cd5934d09e Mon Sep 17 00:00:00 2001 From: Jack Huey <31162821+jackh726@users.noreply.github.com> Date: Thu, 27 Jun 2024 11:45:08 -0400 Subject: [PATCH 29/30] Before summing base values, max with 0 to ignore nan --- pybigtools/src/lib.rs | 14 +++++++++----- pybigtools/tests/test_api.py | 8 ++++++++ 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/pybigtools/src/lib.rs b/pybigtools/src/lib.rs index 4ee84f8..7659f38 100644 --- a/pybigtools/src/lib.rs +++ b/pybigtools/src/lib.rs @@ -766,7 +766,8 @@ fn to_entry_array_bins>>( .iter() .any(|v| *v > 0) .then(|| front.3.into_iter().sum::() as f64) - .map(|c| front.4.into_iter().sum::() / c) + // Map nan to 0 so the sum is non-nan + .map(|c| front.4.into_iter().map(|c| c.max(0.0)).sum::() / c) .unwrap_or(missing); } } @@ -841,7 +842,8 @@ fn to_entry_array_bins>>( .iter() .any(|v| *v > 0) .then(|| front.3.into_iter().sum::() as f64) - .map(|c| front.4.into_iter().sum::() / c) + // Map nan to 0 so the sum is non-nan + .map(|c| front.4.into_iter().map(|c| c.max(0.0)).sum::() / c) .unwrap_or(missing); } } @@ -898,7 +900,8 @@ fn to_entry_array_zoom>>( .iter() .any(|v| *v > 0) .then(|| front.3.into_iter().sum::() as f64) - .map(|c| front.4.into_iter().sum::() / c) + // Map nan to 0 so the sum is non-nan + .map(|c| front.4.into_iter().map(|c| c.max(0.0)).sum::() / c) .unwrap_or(missing); } } @@ -938,7 +941,7 @@ fn to_entry_array_zoom>>( let overlap_start = (*bin_start).max(interval_start); let overlap_end = (*bin_end).min(interval_end); - let mean = interval.summary.sum / (interval.end - interval.start) as f64; + let mean = 
interval.summary.sum / (interval.summary.bases_covered) as f64; let range = ((overlap_start - *bin_start) as usize)..((overlap_end - *bin_start) as usize); for i in &mut data[range.clone()] { @@ -979,7 +982,8 @@ fn to_entry_array_zoom>>( .iter() .any(|v| *v > 0) .then(|| front.3.into_iter().sum::() as f64) - .map(|c| front.4.into_iter().sum::() / c) + // Map nan to 0 so the sum is non-nan + .map(|c| front.4.into_iter().map(|c| c.max(0.0)).sum::() / c) .unwrap_or(missing); } } diff --git a/pybigtools/tests/test_api.py b/pybigtools/tests/test_api.py index 78e323b..5a67ba1 100644 --- a/pybigtools/tests/test_api.py +++ b/pybigtools/tests/test_api.py @@ -325,6 +325,9 @@ def test_values(bw, bb): assert np.array_equal(arr, ret_arr, equal_nan=True) # Some differences in estimates between pybigtools to other libs + # Namely, bigtools calculates estimates by taking the + # sum of nanmeans over covered bases (summary.sum/summary.covered_bases) and dividing by covered bases (overlap between zoom and bin) + # So, including these as cases where the calculated value is different vals = bw.values("chr17", 85525, 85730, bins=2, exact=False) assert list(vals) == [0.15392776003070907, 2.728891665264241] vals = bw.values("chr17", 85525, 85730, bins=2, exact=True) @@ -333,3 +336,8 @@ def test_values(bw, bb): assert list(vals) == [0.5358060553962108, 0.5513471488813751] vals = bw.values("chr17", 59900, 60105, bins=2, exact=True) assert list(vals) == [0.5362001863472602, 0.5527710799959679] + + vals = bb.values("chr21", 14_760_000, 14_800_000, bins=1, exact=False) + assert list(vals) == [1.2572170068028603] + vals = bb.values("chr21", 14_760_000, 14_800_000, bins=1, exact=True) + assert list(vals) == [1.3408662900188324] From fc83f8203162bb55056a0f37854e98666a5a9a3f Mon Sep 17 00:00:00 2001 From: Jack Huey <31162821+jackh726@users.noreply.github.com> Date: Thu, 27 Jun 2024 11:57:13 -0400 Subject: [PATCH 30/30] __exit__ should not return Self (it's truthy and prevents exceptions from 
being raised) --- pybigtools/src/lib.rs | 3 +-- pybigtools/tests/test_api.py | 6 ++++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/pybigtools/src/lib.rs b/pybigtools/src/lib.rs index 7659f38..b965531 100644 --- a/pybigtools/src/lib.rs +++ b/pybigtools/src/lib.rs @@ -1682,9 +1682,8 @@ impl BBIRead { _exc_type: PyObject, _exc_value: PyObject, _exc_traceback: PyObject, - ) -> Py { + ) { slf.borrow_mut(py).bbi = BBIReadRaw::Closed; - slf } fn __traverse__(&self, visit: PyVisit<'_>) -> Result<(), PyTraverseError> { diff --git a/pybigtools/tests/test_api.py b/pybigtools/tests/test_api.py index 5a67ba1..3ba26b6 100644 --- a/pybigtools/tests/test_api.py +++ b/pybigtools/tests/test_api.py @@ -60,6 +60,12 @@ def test_open_filelike(): with pybigtools.open(f, "r") as b: assert b.chroms() == {'chr21': 48_129_895} +def test_contextmanager_exception(): + class RaisesException(Exception): + pass + with pytest.raises(RaisesException): + with pybigtools.open(REPO_ROOT / "bigtools/resources/test/valid.bigWig", "r"): + raise RaisesException() @pytest.fixture def bw():