From 452f0d7cf1a2a2371b8f7b7c574af10f13ea2460 Mon Sep 17 00:00:00 2001 From: Jack Huey <31162821+jackh726@users.noreply.github.com> Date: Wed, 17 Apr 2024 23:29:40 -0400 Subject: [PATCH 01/31] WIP Refactor ChromData --- bigtools/src/bbi/bbiwrite.rs | 35 ++- bigtools/src/bbi/bedchromdata.rs | 49 ++++- bigtools/src/bbi/bigwigwrite.rs | 354 ++++++++++++++++++++++++++----- 3 files changed, 380 insertions(+), 58 deletions(-) diff --git a/bigtools/src/bbi/bbiwrite.rs b/bigtools/src/bbi/bbiwrite.rs index b38a1e7..e155031 100644 --- a/bigtools/src/bbi/bbiwrite.rs +++ b/bigtools/src/bbi/bbiwrite.rs @@ -26,9 +26,9 @@ use crate::utils::tempfilebuffer::{TempFileBuffer, TempFileBufferWriter}; use crate::bbi::{Summary, ZoomHeader, ZoomRecord, CHROM_TREE_MAGIC, CIR_TREE_MAGIC}; pub(crate) struct ZoomInfo { - resolution: u32, - data: TempFileBuffer, - sections: Flatten>>, + pub(crate) resolution: u32, + pub(crate) data: TempFileBuffer, + pub(crate) sections: Flatten>>, } #[derive(Debug)] @@ -555,6 +555,8 @@ pub enum ChromDataState { /// An opaque key to indicate an processing chromosome pub struct ChromProcessingKey(pub(crate) u32); +pub struct ChromProcessedData(pub(crate) Summary); + /// Effectively like an Iterator of chromosome data pub trait ChromData: Sized { type Values: ChromValues; @@ -579,8 +581,31 @@ pub trait ChromData: Sized { >; } +pub trait ChromData2: Sized { + type Values: ChromValues; + + fn process_to_bbi< + Fut: Future< + Output = Result< + ChromProcessedData, + ProcessChromError<::Error>, + >, + >, + StartProcessing: FnMut( + String, + Self::Values, + ) -> Result::Error>>, + Advance: FnMut(ChromProcessedData), + >( + &mut self, + runtime: &Handle, + start_processing: &mut StartProcessing, + advance: &mut Advance, + ) -> Result<(), ProcessChromError<::Error>>; +} + // Zooms have to be double-buffered: first because chroms could be processed in parallel and second because we don't know the offset of each zoom immediately -type ZoomValue = ( +pub(crate) type 
ZoomValue = ( Vec>, TempFileBuffer, Option>, @@ -597,7 +622,7 @@ type DataWithoutzooms = ( futures::future::RemoteHandle>>, ); -async fn write_chroms_with_zooms( +pub(crate) async fn write_chroms_with_zooms( mut file: BufWriter, mut zooms_map: BTreeMap, mut receiver: futures_mpsc::UnboundedReceiver>, diff --git a/bigtools/src/bbi/bedchromdata.rs b/bigtools/src/bbi/bedchromdata.rs index 78701ef..a77613e 100644 --- a/bigtools/src/bbi/bedchromdata.rs +++ b/bigtools/src/bbi/bedchromdata.rs @@ -12,12 +12,14 @@ use std::fs::File; use std::io::{BufReader, Seek, SeekFrom}; use std::path::PathBuf; +use tokio::runtime::Handle; + use crate::bed::bedparser::{ BedChromData, BedFileStream, BedParser, BedValueError, Parser, StateValue, StreamingBedValues, }; use crate::utils::chromvalues::ChromValues; use crate::utils::streaming_linereader::StreamingLineReader; -use crate::{ChromData, ChromDataState, ChromProcessingKey, ProcessChromError}; +use crate::{ChromData, ChromData2, ChromDataState, ChromProcessingKey, ProcessChromError}; pub struct BedParserStreamingIterator { bed_data: BedParser, @@ -72,6 +74,51 @@ impl ChromData for BedParserStreamingIterator { } } +impl ChromData2 for BedParserStreamingIterator { + type Values = BedChromData; + + fn process_to_bbi< + Fut: futures::prelude::Future< + Output = Result< + crate::ChromProcessedData, + ProcessChromError<::Error>, + >, + >, + StartProcessing: FnMut( + String, + Self::Values, + ) -> Result::Error>>, + Advance: FnMut(crate::ChromProcessedData), + >( + &mut self, + runtime: &Handle, + start_processing: &mut StartProcessing, + advance: &mut Advance, + ) -> Result<(), ProcessChromError<::Error>> { + loop { + match self.bed_data.next_chrom() { + Some(Ok((chrom, group))) => { + // First, if we don't want to allow out of order chroms, error here + let last = self.last_chrom.replace(chrom.clone()); + if let Some(c) = last { + // TODO: test this correctly fails + if !self.allow_out_of_order_chroms && c >= chrom { + return 
Err(ProcessChromError::SourceError(BedValueError::InvalidInput("Input bedGraph not sorted by chromosome. Sort with `sort -k1,1 -k2,2n`.".to_string()))); + } + } + + let read = start_processing(chrom, group)?; + let data = runtime.block_on(read)?; + advance(data); + } + Some(Err(e)) => return Err(ProcessChromError::SourceError(e)), + None => break, + } + } + + Ok(()) + } +} pub struct BedParserParallelStreamingIterator { allow_out_of_order_chroms: bool, last_chrom: Option, diff --git a/bigtools/src/bbi/bigwigwrite.rs b/bigtools/src/bbi/bigwigwrite.rs index b615762..6e9bc69 100644 --- a/bigtools/src/bbi/bigwigwrite.rs +++ b/bigtools/src/bbi/bigwigwrite.rs @@ -40,12 +40,14 @@ out.write(chrom_map, vals, runtime)?; # } ``` */ -use std::collections::HashMap; +use std::collections::{BTreeMap, HashMap}; use std::error::Error; use std::fs::File; use std::future::Future; use std::io::{self, BufWriter, Write}; +use std::vec; +use futures::channel::mpsc as futures_mpsc; use futures::future::FutureExt; use futures::sink::SinkExt; @@ -53,8 +55,14 @@ use byteorder::{NativeEndian, WriteBytesExt}; use tokio::runtime::{Handle, Runtime}; use crate::utils::chromvalues::ChromValues; +use crate::utils::idmap::IdMap; use crate::utils::tell::Tell; -use crate::{write_info, ChromData, ChromProcessingInputSectionChannel}; +use crate::utils::tempfilebuffer::{TempFileBuffer, TempFileBufferWriter}; +use crate::{ + future_channel, write_chroms_with_zooms, write_info, ChromData, ChromData2, ChromDataState, + ChromProcessedData, ChromProcessingInputSectionChannel, ChromProcessingKey, Section, + TempZoomInfo, ZoomInfo, ZoomValue, +}; use crate::bbi::{Summary, Value, ZoomRecord, BIGWIG_MAGIC}; use crate::bbiwrite::{ @@ -115,81 +123,323 @@ impl BigWigWrite { >( self, chrom_sizes: HashMap, - vals: V, + mut vals: V, runtime: Runtime, ) -> Result<(), ProcessChromError> { - let process_chrom = |zooms_channels: Vec<(u32, ChromProcessingInputSectionChannel)>, - ftx: ChromProcessingInputSectionChannel, - 
chrom_id: u32, - options: BBIWriteOptions, - runtime: Handle, - chrom_values: Values, - chrom: String, - chrom_length: u32| { + let options = self.options; + let fp = File::create(self.path.clone())?; + let mut file = BufWriter::new(fp); + + let (total_summary_offset, full_data_offset, pre_data) = BigWigWrite::write_pre(&mut file)?; + + let zooms_map: BTreeMap = + std::iter::successors(Some(options.initial_zoom_size), |z| Some(z * 4)) + .take(options.max_zooms as usize) + .map(|size| { + let section_iter = vec![]; + let (buf, write): (TempFileBuffer, TempFileBufferWriter) = + TempFileBuffer::new(options.inmemory); + let value = (section_iter, buf, Some(write)); + (size, value) + }) + .collect(); + + let mut chrom_ids = IdMap::default(); + + let mut key = 0; + let mut output: BTreeMap = BTreeMap::new(); + + let mut summary: Option = None; + let (send, recv) = futures_mpsc::unbounded(); + let write_fut = write_chroms_with_zooms(file, zooms_map, recv); + + let setup_chrom = || { + let (ftx, sections_handle, buf, section_receiver) = + future_channel(options.channel_size, runtime.handle(), options.inmemory); + + let (zoom_infos, zooms_channels) = { + let mut zoom_infos = Vec::with_capacity(options.max_zooms as usize); + let mut zooms_channels = Vec::with_capacity(options.max_zooms as usize); + + let zoom_sizes = + std::iter::successors(Some(options.initial_zoom_size), |z| Some(z * 4)) + .take(options.max_zooms as usize); + for size in zoom_sizes { + let (ftx, handle, buf, section_receiver) = + future_channel(options.channel_size, runtime.handle(), options.inmemory); + let zoom_info = TempZoomInfo { + resolution: size, + data_write_future: Box::new(handle), + data: buf, + sections: section_receiver, + }; + zoom_infos.push(zoom_info); + zooms_channels.push((size, ftx)); + } + (zoom_infos, zooms_channels) + }; + + match send.unbounded_send((section_receiver, buf, sections_handle, zoom_infos)) { + Ok(_) => {} + Err(_) => panic!("Expected to always send."), + } + + 
(zooms_channels, ftx) + }; + let mut do_read = |chrom: String, + data: _, + output: &mut BTreeMap| + -> Result> { + let length = match chrom_sizes.get(&chrom) { + Some(length) => *length, + None => { + return Err(ProcessChromError::InvalidChromosome(format!( + "Input bedGraph contains chromosome that isn't in the input chrom sizes: {}", + chrom + ))); + } + }; + // Make a new id for the chromosome + let chrom_id = chrom_ids.get_id(&chrom); + + let (zooms_channels, ftx) = setup_chrom(); + let fut = BigWigWrite::process_chrom( zooms_channels, ftx, chrom_id, options, - runtime.clone(), - chrom_values, + runtime.handle().clone(), + data, chrom, - chrom_length, + length, ); - runtime.spawn(fut).map(|f| f.unwrap()) + let fut = runtime.spawn(fut).map(|f| f.unwrap()); + + let curr_key = key; + key += 1; + + output.insert(curr_key, fut); + + Ok(ChromProcessingKey(curr_key)) }; - self.write_internal(chrom_sizes, vals, runtime, process_chrom) + + let (write_fut, write_fut_handle) = write_fut.remote_handle(); + runtime.spawn(write_fut); + loop { + match vals.advance(&mut do_read, &mut output)? { + ChromDataState::NewChrom(read) => { + let fut = output.remove(&read.0).unwrap(); + let chrom_summary = runtime.block_on(fut)?; + match &mut summary { + None => summary = Some(chrom_summary), + Some(summary) => { + summary.total_items += chrom_summary.total_items; + summary.bases_covered += chrom_summary.bases_covered; + summary.min_val = summary.min_val.min(chrom_summary.min_val); + summary.max_val = summary.max_val.max(chrom_summary.max_val); + summary.sum += chrom_summary.sum; + summary.sum_squares += chrom_summary.sum_squares; + } + } + } + ChromDataState::Finished => break, + ChromDataState::Error(err) => return Err(ProcessChromError::SourceError(err)), + } + } + drop(send); + + self.write_internal_post( + summary, + runtime, + write_fut_handle, + chrom_ids, + pre_data, + chrom_sizes, + full_data_offset, + total_summary_offset, + ) } /// Write the values from `V` as a bigWig. 
Will utilize the provided runtime for encoding values, but will read through values on the current thread. pub fn write_singlethreaded< Values: ChromValues, - V: ChromData, + V: ChromData2, >( self, chrom_sizes: HashMap, - vals: V, + mut vals: V, runtime: Runtime, ) -> Result<(), ProcessChromError> { - self.write_internal(chrom_sizes, vals, runtime, BigWigWrite::process_chrom) - } - - fn write_internal< - Values: ChromValues, - V: ChromData, - Fut: Future>>, - G: Fn( - Vec<(u32, ChromProcessingInputSectionChannel)>, - ChromProcessingInputSectionChannel, - u32, - BBIWriteOptions, - Handle, - Values, - String, - u32, - ) -> Fut, - >( - self, - chrom_sizes: HashMap, - vals: V, - runtime: Runtime, - process_chrom: G, - ) -> Result<(), ProcessChromError> { + let options = self.options; let fp = File::create(self.path.clone())?; let mut file = BufWriter::new(fp); let (total_summary_offset, full_data_offset, pre_data) = BigWigWrite::write_pre(&mut file)?; - let output = bbiwrite::write_vals( - vals, - file, - self.options, - process_chrom, + let zooms_map: BTreeMap = + std::iter::successors(Some(options.initial_zoom_size), |z| Some(z * 4)) + .take(options.max_zooms as usize) + .map(|size| { + let section_iter = vec![]; + let (buf, write): (TempFileBuffer, TempFileBufferWriter) = + TempFileBuffer::new(options.inmemory); + let value = (section_iter, buf, Some(write)); + (size, value) + }) + .collect(); + + let mut chrom_ids = IdMap::default(); + + let mut summary: Option = None; + let (send, recv) = futures_mpsc::unbounded(); + let write_fut = write_chroms_with_zooms(file, zooms_map, recv); + let (write_fut, write_fut_handle) = write_fut.remote_handle(); + runtime.spawn(write_fut); + + let handle = runtime.handle(); + + let setup_chrom = || { + let (ftx, sections_handle, buf, section_receiver) = + future_channel(options.channel_size, runtime.handle(), options.inmemory); + + let (zoom_infos, zooms_channels) = { + let mut zoom_infos = Vec::with_capacity(options.max_zooms as 
usize); + let mut zooms_channels = Vec::with_capacity(options.max_zooms as usize); + + let zoom_sizes = + std::iter::successors(Some(options.initial_zoom_size), |z| Some(z * 4)) + .take(options.max_zooms as usize); + for size in zoom_sizes { + let (ftx, handle, buf, section_receiver) = + future_channel(options.channel_size, runtime.handle(), options.inmemory); + let zoom_info = TempZoomInfo { + resolution: size, + data_write_future: Box::new(handle), + data: buf, + sections: section_receiver, + }; + zoom_infos.push(zoom_info); + zooms_channels.push((size, ftx)); + } + (zoom_infos, zooms_channels) + }; + + match send.unbounded_send((section_receiver, buf, sections_handle, zoom_infos)) { + Ok(_) => {} + Err(_) => panic!("Expected to always send."), + } + + (zooms_channels, ftx) + }; + let mut do_read = |chrom: String, data: _| -> Result<_, ProcessChromError<_>> { + let length = match chrom_sizes.get(&chrom) { + Some(length) => *length, + None => { + return Err(ProcessChromError::InvalidChromosome(format!( + "Input bedGraph contains chromosome that isn't in the input chrom sizes: {}", + chrom + ))); + } + }; + // Make a new id for the chromosome + let chrom_id = chrom_ids.get_id(&chrom); + + let (zooms_channels, ftx) = setup_chrom(); + + let fut = BigWigWrite::process_chrom( + zooms_channels, + ftx, + chrom_id, + options, + handle.clone(), + data, + chrom, + length, + ); + + let fut = fut.map(|f| f.map(|s| ChromProcessedData(s))); + + Ok(fut) + }; + + let mut advance = |data: ChromProcessedData| { + let ChromProcessedData(chrom_summary) = data; + match &mut summary { + None => summary = Some(chrom_summary), + Some(summary) => { + summary.total_items += chrom_summary.total_items; + summary.bases_covered += chrom_summary.bases_covered; + summary.min_val = summary.min_val.min(chrom_summary.min_val); + summary.max_val = summary.max_val.max(chrom_summary.max_val); + summary.sum += chrom_summary.sum; + summary.sum_squares += chrom_summary.sum_squares; + } + } + }; + + 
vals.process_to_bbi(handle, &mut do_read, &mut advance)?; + + drop(send); + + self.write_internal_post( + summary, runtime, - chrom_sizes.clone(), - ); - let (chrom_ids, summary, mut file, raw_sections_iter, zoom_infos, uncompress_buf_size) = - output?; + write_fut_handle, + chrom_ids, + pre_data, + chrom_sizes, + full_data_offset, + total_summary_offset, + ) + } + + fn write_internal_post( + self, + summary: Option, + runtime: Runtime, + write_fut_handle: impl Future< + Output = Result< + ( + BufWriter, + usize, + Vec>, + BTreeMap, + ), + ProcessChromError, + >, + >, + chrom_ids: IdMap, + pre_data: u64, + chrom_sizes: HashMap, + full_data_offset: u64, + total_summary_offset: u64, + ) -> Result<(), ProcessChromError> { + let summary = summary.unwrap_or(Summary { + total_items: 0, + bases_covered: 0, + min_val: 0.0, + max_val: 0.0, + sum: 0.0, + sum_squares: 0.0, + }); + + let (mut file, max_uncompressed_buf_size, section_iter, zooms_map) = + runtime.block_on(write_fut_handle)?; + + let zoom_infos: Vec = zooms_map + .into_iter() + .map(|(size, zoom)| { + drop(zoom.2); + let sections = zoom.0.into_iter().flatten(); + ZoomInfo { + resolution: size, + data: zoom.1, + sections, + } + }) + .collect(); + let raw_sections_iter = section_iter.into_iter().flatten(); let chrom_ids = chrom_ids.get_map(); let (data_size, chrom_index_start, index_start, total_sections) = bbiwrite::write_mid( @@ -215,7 +465,7 @@ impl BigWigWrite { 0, 0, total_summary_offset, - uncompress_buf_size, + max_uncompressed_buf_size, zoom_entries, summary, total_sections, From fd8ba7fb47429d0191850c5ed562c997fd4b1fa0 Mon Sep 17 00:00:00 2001 From: Jack Huey <31162821+jackh726@users.noreply.github.com> Date: Thu, 18 Apr 2024 22:52:36 -0400 Subject: [PATCH 02/31] WIP convert StreamingIterator to new ChromData --- bigtools/src/bbi/bedchromdata.rs | 78 ++++++++++++++++++++++++++++++++ bigtools/src/bbi/bigwigwrite.rs | 60 ++++++++++-------------- 2 files changed, 102 insertions(+), 36 deletions(-) diff --git 
a/bigtools/src/bbi/bedchromdata.rs b/bigtools/src/bbi/bedchromdata.rs index a77613e..a2639ce 100644 --- a/bigtools/src/bbi/bedchromdata.rs +++ b/bigtools/src/bbi/bedchromdata.rs @@ -9,6 +9,7 @@ use std::collections::VecDeque; use std::fs::File; +use std::future::Future; use std::io::{BufReader, Seek, SeekFrom}; use std::path::PathBuf; @@ -119,6 +120,7 @@ impl ChromData2 for BedParserStreamingIterator { Ok(()) } } + pub struct BedParserParallelStreamingIterator { allow_out_of_order_chroms: bool, last_chrom: Option, @@ -224,6 +226,82 @@ impl ChromData } } + +impl ChromData2 + for BedParserParallelStreamingIterator, BedValueError> { + type Values = BedChromData>>; + + fn process_to_bbi< + Fut: Future< + Output = Result< + crate::ChromProcessedData, + ProcessChromError<::Error>, + >, + >, + StartProcessing: FnMut( + String, + Self::Values, + ) -> Result::Error>>, + Advance: FnMut(crate::ChromProcessedData), + >( + &mut self, + runtime: &Handle, + start_processing: &mut StartProcessing, + advance: &mut Advance, + ) -> Result<(), ProcessChromError<::Error>> { + let mut remaining = true; + let mut queued_reads: VecDeque<_> = VecDeque::new(); + loop { + while remaining && queued_reads.len() < (4 + 1) { + let curr = match self.chrom_indices.pop() { + Some(c) => c, + None => { + remaining = false; + break; + } + }; + + let mut file = match File::open(&self.path) { + Ok(f) => f, + Err(err) => return Err(ProcessChromError::SourceError(err.into())), + }; + file.seek(SeekFrom::Start(curr.0))?; + let mut parser = BedParser::new(BedFileStream { + bed: StreamingLineReader::new(BufReader::new(file)), + parse: self.parse_fn, + }); + + match parser.next_chrom() { + Some(Ok((chrom, group))) => { + let last = self.last_chrom.replace(chrom.clone()); + if let Some(c) = last { + // TODO: test this correctly fails + if !self.allow_out_of_order_chroms && c >= chrom { + return Err(ProcessChromError::SourceError(BedValueError::InvalidInput("Input bedGraph not sorted by chromosome. 
Sort with `sort -k1,1 -k2,2n`.".to_string()))); + } + } + + let read = start_processing(chrom, group)?; + let data = runtime.spawn(read); + queued_reads.push_back(data); + } + Some(Err(e)) => return Err(ProcessChromError::SourceError(e)), + None => { + panic!("Unexpected end of file") + } + } + } + let Some(next_chrom) = queued_reads.pop_front() else { + break + }; + let data = runtime.block_on(next_chrom).unwrap()?; + advance(data); + } + + Ok(()) + } +} + impl ChromValues for BedChromData { type Value = S::Value; type Error = BedValueError; diff --git a/bigtools/src/bbi/bigwigwrite.rs b/bigtools/src/bbi/bigwigwrite.rs index 6e9bc69..f38633e 100644 --- a/bigtools/src/bbi/bigwigwrite.rs +++ b/bigtools/src/bbi/bigwigwrite.rs @@ -119,7 +119,7 @@ impl BigWigWrite { /// Write the values from `V` as a bigWig. Will utilize the provided runtime for encoding values and for reading through the values (potentially parallelized by chromosome). pub fn write< Values: ChromValues + Send + 'static, - V: ChromData, + V: ChromData2, >( self, chrom_sizes: HashMap, @@ -146,12 +146,13 @@ impl BigWigWrite { let mut chrom_ids = IdMap::default(); - let mut key = 0; - let mut output: BTreeMap = BTreeMap::new(); - let mut summary: Option = None; let (send, recv) = futures_mpsc::unbounded(); let write_fut = write_chroms_with_zooms(file, zooms_map, recv); + let (write_fut, write_fut_handle) = write_fut.remote_handle(); + runtime.spawn(write_fut); + + let handle = runtime.handle(); let setup_chrom = || { let (ftx, sections_handle, buf, section_receiver) = @@ -186,10 +187,7 @@ impl BigWigWrite { (zooms_channels, ftx) }; - let mut do_read = |chrom: String, - data: _, - output: &mut BTreeMap| - -> Result> { + let mut do_read = |chrom: String, data: _| -> Result<_, ProcessChromError<_>> { let length = match chrom_sizes.get(&chrom) { Some(length) => *length, None => { @@ -209,44 +207,34 @@ impl BigWigWrite { ftx, chrom_id, options, - runtime.handle().clone(), + handle.clone(), data, chrom, length, 
); - let fut = runtime.spawn(fut).map(|f| f.unwrap()); - - let curr_key = key; - key += 1; - output.insert(curr_key, fut); + let fut = fut.map(|f| f.map(|s| ChromProcessedData(s))); - Ok(ChromProcessingKey(curr_key)) + Ok(fut) }; - let (write_fut, write_fut_handle) = write_fut.remote_handle(); - runtime.spawn(write_fut); - loop { - match vals.advance(&mut do_read, &mut output)? { - ChromDataState::NewChrom(read) => { - let fut = output.remove(&read.0).unwrap(); - let chrom_summary = runtime.block_on(fut)?; - match &mut summary { - None => summary = Some(chrom_summary), - Some(summary) => { - summary.total_items += chrom_summary.total_items; - summary.bases_covered += chrom_summary.bases_covered; - summary.min_val = summary.min_val.min(chrom_summary.min_val); - summary.max_val = summary.max_val.max(chrom_summary.max_val); - summary.sum += chrom_summary.sum; - summary.sum_squares += chrom_summary.sum_squares; - } - } + let mut advance = |data: ChromProcessedData| { + let ChromProcessedData(chrom_summary) = data; + match &mut summary { + None => summary = Some(chrom_summary), + Some(summary) => { + summary.total_items += chrom_summary.total_items; + summary.bases_covered += chrom_summary.bases_covered; + summary.min_val = summary.min_val.min(chrom_summary.min_val); + summary.max_val = summary.max_val.max(chrom_summary.max_val); + summary.sum += chrom_summary.sum; + summary.sum_squares += chrom_summary.sum_squares; } - ChromDataState::Finished => break, - ChromDataState::Error(err) => return Err(ProcessChromError::SourceError(err)), } - } + }; + + vals.process_to_bbi(handle, &mut do_read, &mut advance)?; + drop(send); self.write_internal_post( From 85acb06804d325f6fe84a95a3fca15bf80d8025c Mon Sep 17 00:00:00 2001 From: Jack Huey <31162821+jackh726@users.noreply.github.com> Date: Thu, 18 Apr 2024 23:09:10 -0400 Subject: [PATCH 03/31] Create ChromProcess trait --- bigtools/src/bbi/bbiwrite.rs | 33 ++++++++++++--------- bigtools/src/bbi/bedchromdata.rs | 12 ++++---- 
bigtools/src/bbi/bigbedwrite.rs | 32 +++++++++++++++++++-- bigtools/src/bbi/bigwigwrite.rs | 34 ++++++++++++++++++++-- bigtools/src/utils/cli/bigwigmerge.rs | 41 ++++++++++++++++++++++++++- 5 files changed, 126 insertions(+), 26 deletions(-) diff --git a/bigtools/src/bbi/bbiwrite.rs b/bigtools/src/bbi/bbiwrite.rs index e155031..3fe85d7 100644 --- a/bigtools/src/bbi/bbiwrite.rs +++ b/bigtools/src/bbi/bbiwrite.rs @@ -1,6 +1,7 @@ use std::collections::{BTreeMap, HashMap}; use std::error::Error; use std::fs::File; +use std::future::Future; use std::io::{self, BufWriter, Seek, SeekFrom, Write}; use std::iter::Flatten; use std::pin::Pin; @@ -12,7 +13,7 @@ use thiserror::Error; use futures::channel::mpsc as futures_mpsc; use futures::channel::mpsc::channel; -use futures::future::{Future, FutureExt}; +use futures::future::FutureExt; use futures::stream::StreamExt; use serde::{Deserialize, Serialize}; @@ -718,25 +719,29 @@ async fn write_chroms_without_zooms( Ok((file, max_uncompressed_buf_size, section_iter)) } +pub(crate) trait ChromProcess { + type Value; + async fn do_process>( + zooms_channels: Vec<(u32, ChromProcessingInputSectionChannel)>, + ftx: ChromProcessingInputSectionChannel, + chrom_id: u32, + options: BBIWriteOptions, + runtime: Handle, + data: Values, + chrom: String, + length: u32, + ) -> Result>; +} + pub(crate) fn write_vals< Values: ChromValues, V: ChromData, - Fut: Future>>, - G: Fn( - Vec<(u32, ChromProcessingInputSectionChannel)>, - ChromProcessingInputSectionChannel, - u32, - BBIWriteOptions, - Handle, - Values, - String, - u32, - ) -> Fut, + P: ChromProcess, >( mut vals_iter: V, file: BufWriter, options: BBIWriteOptions, - process_chrom: G, + process_chrom: P, runtime: Runtime, chrom_sizes: HashMap, ) -> Result< @@ -822,7 +827,7 @@ pub(crate) fn write_vals< let (zooms_channels, ftx) = setup_chrom(); - let fut = process_chrom( + let fut = P::do_process( zooms_channels, ftx, chrom_id, diff --git a/bigtools/src/bbi/bedchromdata.rs 
b/bigtools/src/bbi/bedchromdata.rs index a2639ce..74de4e6 100644 --- a/bigtools/src/bbi/bedchromdata.rs +++ b/bigtools/src/bbi/bedchromdata.rs @@ -226,9 +226,9 @@ impl ChromData } } - impl ChromData2 - for BedParserParallelStreamingIterator, BedValueError> { + for BedParserParallelStreamingIterator, BedValueError> +{ type Values = BedChromData>>; fn process_to_bbi< @@ -260,7 +260,7 @@ impl ChromData2 break; } }; - + let mut file = match File::open(&self.path) { Ok(f) => f, Err(err) => return Err(ProcessChromError::SourceError(err.into())), @@ -270,7 +270,7 @@ impl ChromData2 bed: StreamingLineReader::new(BufReader::new(file)), parse: self.parse_fn, }); - + match parser.next_chrom() { Some(Ok((chrom, group))) => { let last = self.last_chrom.replace(chrom.clone()); @@ -280,7 +280,7 @@ impl ChromData2 return Err(ProcessChromError::SourceError(BedValueError::InvalidInput("Input bedGraph not sorted by chromosome. Sort with `sort -k1,1 -k2,2n`.".to_string()))); } } - + let read = start_processing(chrom, group)?; let data = runtime.spawn(read); queued_reads.push_back(data); @@ -292,7 +292,7 @@ impl ChromData2 } } let Some(next_chrom) = queued_reads.pop_front() else { - break + break; }; let data = runtime.block_on(next_chrom).unwrap()?; advance(data); diff --git a/bigtools/src/bbi/bigbedwrite.rs b/bigtools/src/bbi/bigbedwrite.rs index c038a66..c0973fe 100644 --- a/bigtools/src/bbi/bigbedwrite.rs +++ b/bigtools/src/bbi/bigbedwrite.rs @@ -14,7 +14,7 @@ use tokio::runtime::{Handle, Runtime}; use crate::utils::chromvalues::ChromValues; use crate::utils::indexlist::IndexList; use crate::utils::tell::Tell; -use crate::{write_info, ChromData, ChromProcessingInputSectionChannel}; +use crate::{write_info, ChromData, ChromProcess, ChromProcessingInputSectionChannel}; use crate::bbi::{BedEntry, Summary, Value, ZoomRecord, BIGBED_MAGIC}; use crate::bbiwrite::{ @@ -154,7 +154,7 @@ impl BigBedWrite { vals, file, self.options, - process_chrom, + BigBedFullProcess, runtime, 
chrom_sizes.clone(), ); @@ -858,6 +858,34 @@ struct EntriesSection { overlap: IndexList, } +pub(crate) struct BigBedFullProcess; + +impl ChromProcess for BigBedFullProcess { + type Value = BedEntry; + async fn do_process>( + zooms_channels: Vec<(u32, ChromProcessingInputSectionChannel)>, + ftx: ChromProcessingInputSectionChannel, + chrom_id: u32, + options: BBIWriteOptions, + runtime: Handle, + data: Values, + chrom: String, + length: u32, + ) -> Result> { + BigBedWrite::process_chrom( + zooms_channels, + ftx, + chrom_id, + options, + runtime, + data, + chrom, + length, + ) + .await + } +} + async fn encode_section( compress: bool, items_in_section: Vec, diff --git a/bigtools/src/bbi/bigwigwrite.rs b/bigtools/src/bbi/bigwigwrite.rs index f38633e..92d986c 100644 --- a/bigtools/src/bbi/bigwigwrite.rs +++ b/bigtools/src/bbi/bigwigwrite.rs @@ -59,9 +59,9 @@ use crate::utils::idmap::IdMap; use crate::utils::tell::Tell; use crate::utils::tempfilebuffer::{TempFileBuffer, TempFileBufferWriter}; use crate::{ - future_channel, write_chroms_with_zooms, write_info, ChromData, ChromData2, ChromDataState, - ChromProcessedData, ChromProcessingInputSectionChannel, ChromProcessingKey, Section, - TempZoomInfo, ZoomInfo, ZoomValue, + future_channel, write_chroms_with_zooms, write_info, ChromData, ChromData2, ChromProcess, + ChromProcessedData, ChromProcessingInputSectionChannel, Section, TempZoomInfo, ZoomInfo, + ZoomValue, }; use crate::bbi::{Summary, Value, ZoomRecord, BIGWIG_MAGIC}; @@ -909,6 +909,34 @@ impl BigWigWrite { } } +pub(crate) struct BigWigFullProcess; + +impl ChromProcess for BigWigFullProcess { + type Value = Value; + async fn do_process>( + zooms_channels: Vec<(u32, ChromProcessingInputSectionChannel)>, + ftx: ChromProcessingInputSectionChannel, + chrom_id: u32, + options: BBIWriteOptions, + runtime: Handle, + data: Values, + chrom: String, + length: u32, + ) -> Result> { + BigWigWrite::process_chrom( + zooms_channels, + ftx, + chrom_id, + options, + runtime, + data, 
+ chrom, + length, + ) + .await + } +} + async fn encode_section( compress: bool, items_in_section: Vec, diff --git a/bigtools/src/utils/cli/bigwigmerge.rs b/bigtools/src/utils/cli/bigwigmerge.rs index 353ee22..02fc8fd 100644 --- a/bigtools/src/utils/cli/bigwigmerge.rs +++ b/bigtools/src/utils/cli/bigwigmerge.rs @@ -1,6 +1,7 @@ use std::collections::{BTreeMap, HashMap}; use std::error::Error; use std::fs::File; +use std::future::Future; use std::io::{self, BufRead, BufReader}; use clap::Parser; @@ -10,9 +11,9 @@ use thiserror::Error; use crate::utils::chromvalues::ChromValues; use crate::utils::merge::merge_sections_many; use crate::utils::reopen::ReopenableFile; -use crate::Value; use crate::{BBIReadError, BigWigRead, BigWigWrite}; use crate::{ChromData, ChromDataState, ChromProcessingKey, ProcessChromError}; +use crate::{ChromData2, Value}; use tokio::runtime; use super::BBIWriteArgs; @@ -390,3 +391,41 @@ impl ChromData for ChromGroupReadImpl { }) } } + +impl ChromData2 for ChromGroupReadImpl { + type Values = MergingValues; + fn process_to_bbi< + Fut: Future< + Output = Result< + crate::ChromProcessedData, + ProcessChromError<::Error>, + >, + >, + StartProcessing: FnMut( + String, + Self::Values, + ) -> Result::Error>>, + Advance: FnMut(crate::ChromProcessedData), + >( + &mut self, + runtime: &runtime::Handle, + start_processing: &mut StartProcessing, + advance: &mut Advance, + ) -> Result<(), ProcessChromError<::Error>> { + loop { + let next: Option> = + self.iter.next(); + match next { + Some(Ok((chrom, _, group))) => { + let read = start_processing(chrom, group)?; + let data = runtime.block_on(read)?; + advance(data); + } + Some(Err(e)) => return Err(ProcessChromError::SourceError(e)), + None => break, + } + } + + Ok(()) + } +} From 3d7a802ee489b5552142f474b08bd7a9f6b36871 Mon Sep 17 00:00:00 2001 From: Jack Huey <31162821+jackh726@users.noreply.github.com> Date: Thu, 18 Apr 2024 23:36:12 -0400 Subject: [PATCH 04/31] Pass trait to new ChromData --- 
bigtools/src/bbi/bbiwrite.rs | 39 +++++++++-------- bigtools/src/bbi/bedchromdata.rs | 38 ++++++++-------- bigtools/src/bbi/bigbedwrite.rs | 35 +++++++++------ bigtools/src/bbi/bigwigwrite.rs | 62 +++++++++++++-------------- bigtools/src/utils/cli/bigwigmerge.rs | 19 ++++---- 5 files changed, 96 insertions(+), 97 deletions(-) diff --git a/bigtools/src/bbi/bbiwrite.rs b/bigtools/src/bbi/bbiwrite.rs index 3fe85d7..db9984c 100644 --- a/bigtools/src/bbi/bbiwrite.rs +++ b/bigtools/src/bbi/bbiwrite.rs @@ -586,16 +586,13 @@ pub trait ChromData2: Sized { type Values: ChromValues; fn process_to_bbi< - Fut: Future< - Output = Result< - ChromProcessedData, - ProcessChromError<::Error>, - >, - >, + P: ChromProcess::Value>, StartProcessing: FnMut( String, - Self::Values, - ) -> Result::Error>>, + ) -> Result< + InternalProcessData, + ProcessChromError<::Error>, + >, Advance: FnMut(ChromProcessedData), >( &mut self, @@ -719,18 +716,22 @@ async fn write_chroms_without_zooms( Ok((file, max_uncompressed_buf_size, section_iter)) } +pub struct InternalProcessData( + pub(crate) Vec<(u32, ChromProcessingInputSectionChannel)>, + pub(crate) ChromProcessingInputSectionChannel, + pub(crate) u32, + pub(crate) BBIWriteOptions, + pub(crate) Handle, + pub(crate) String, + pub(crate) u32, +); + pub(crate) trait ChromProcess { type Value; async fn do_process>( - zooms_channels: Vec<(u32, ChromProcessingInputSectionChannel)>, - ftx: ChromProcessingInputSectionChannel, - chrom_id: u32, - options: BBIWriteOptions, - runtime: Handle, + internal_data: InternalProcessData, data: Values, - chrom: String, - length: u32, - ) -> Result>; + ) -> Result>; } pub(crate) fn write_vals< @@ -827,16 +828,16 @@ pub(crate) fn write_vals< let (zooms_channels, ftx) = setup_chrom(); - let fut = P::do_process( + let internal_data = InternalProcessData( zooms_channels, ftx, chrom_id, options, runtime.handle().clone(), - data, chrom, length, ); + let fut = P::do_process(internal_data, data); let curr_key = key; key += 1; 
@@ -852,7 +853,7 @@ pub(crate) fn write_vals< match vals_iter.advance(&mut do_read, &mut output)? { ChromDataState::NewChrom(read) => { let fut = output.remove(&read.0).unwrap(); - let chrom_summary = runtime.block_on(fut)?; + let chrom_summary = runtime.block_on(fut)?.0; match &mut summary { None => summary = Some(chrom_summary), Some(summary) => { diff --git a/bigtools/src/bbi/bedchromdata.rs b/bigtools/src/bbi/bedchromdata.rs index 74de4e6..caa1c98 100644 --- a/bigtools/src/bbi/bedchromdata.rs +++ b/bigtools/src/bbi/bedchromdata.rs @@ -9,7 +9,6 @@ use std::collections::VecDeque; use std::fs::File; -use std::future::Future; use std::io::{BufReader, Seek, SeekFrom}; use std::path::PathBuf; @@ -20,7 +19,10 @@ use crate::bed::bedparser::{ }; use crate::utils::chromvalues::ChromValues; use crate::utils::streaming_linereader::StreamingLineReader; -use crate::{ChromData, ChromData2, ChromDataState, ChromProcessingKey, ProcessChromError}; +use crate::{ + ChromData, ChromData2, ChromDataState, ChromProcess, ChromProcessingKey, InternalProcessData, + ProcessChromError, +}; pub struct BedParserStreamingIterator { bed_data: BedParser, @@ -79,16 +81,13 @@ impl ChromData2 for BedParserStreamingIterator { type Values = BedChromData; fn process_to_bbi< - Fut: futures::prelude::Future< - Output = Result< - crate::ChromProcessedData, - ProcessChromError<::Error>, - >, - >, + P: ChromProcess::Value>, StartProcessing: FnMut( String, - Self::Values, - ) -> Result::Error>>, + ) -> Result< + InternalProcessData, + ProcessChromError<::Error>, + >, Advance: FnMut(crate::ChromProcessedData), >( &mut self, @@ -108,7 +107,8 @@ impl ChromData2 for BedParserStreamingIterator { } } - let read = start_processing(chrom, group)?; + let internal_data = start_processing(chrom)?; + let read = P::do_process(internal_data, group); let data = runtime.block_on(read)?; advance(data); } @@ -232,16 +232,13 @@ impl ChromData2 type Values = BedChromData>>; fn process_to_bbi< - Fut: Future< - Output = 
Result< - crate::ChromProcessedData, - ProcessChromError<::Error>, - >, - >, + P: ChromProcess::Value>, StartProcessing: FnMut( String, - Self::Values, - ) -> Result::Error>>, + ) -> Result< + InternalProcessData, + ProcessChromError<::Error>, + >, Advance: FnMut(crate::ChromProcessedData), >( &mut self, @@ -281,7 +278,8 @@ impl ChromData2 } } - let read = start_processing(chrom, group)?; + let internal_data = start_processing(chrom)?; + let read = P::do_process(internal_data, group); let data = runtime.spawn(read); queued_reads.push_back(data); } diff --git a/bigtools/src/bbi/bigbedwrite.rs b/bigtools/src/bbi/bigbedwrite.rs index c0973fe..3fda89d 100644 --- a/bigtools/src/bbi/bigbedwrite.rs +++ b/bigtools/src/bbi/bigbedwrite.rs @@ -14,7 +14,10 @@ use tokio::runtime::{Handle, Runtime}; use crate::utils::chromvalues::ChromValues; use crate::utils::indexlist::IndexList; use crate::utils::tell::Tell; -use crate::{write_info, ChromData, ChromProcess, ChromProcessingInputSectionChannel}; +use crate::{ + write_info, ChromData, ChromProcess, ChromProcessedData, ChromProcessingInputSectionChannel, + InternalProcessData, +}; use crate::bbi::{BedEntry, Summary, Value, ZoomRecord, BIGBED_MAGIC}; use crate::bbiwrite::{ @@ -863,26 +866,30 @@ pub(crate) struct BigBedFullProcess; impl ChromProcess for BigBedFullProcess { type Value = BedEntry; async fn do_process>( - zooms_channels: Vec<(u32, ChromProcessingInputSectionChannel)>, - ftx: ChromProcessingInputSectionChannel, - chrom_id: u32, - options: BBIWriteOptions, - runtime: Handle, - data: Values, - chrom: String, - length: u32, - ) -> Result> { - BigBedWrite::process_chrom( + InternalProcessData( zooms_channels, ftx, chrom_id, options, runtime, - data, chrom, length, - ) - .await + ): InternalProcessData, + data: Values, + ) -> Result> { + Ok(ChromProcessedData( + BigBedWrite::process_chrom( + zooms_channels, + ftx, + chrom_id, + options, + runtime, + data, + chrom, + length, + ) + .await?, + )) } } diff --git 
a/bigtools/src/bbi/bigwigwrite.rs b/bigtools/src/bbi/bigwigwrite.rs index 92d986c..283ee3c 100644 --- a/bigtools/src/bbi/bigwigwrite.rs +++ b/bigtools/src/bbi/bigwigwrite.rs @@ -60,8 +60,8 @@ use crate::utils::tell::Tell; use crate::utils::tempfilebuffer::{TempFileBuffer, TempFileBufferWriter}; use crate::{ future_channel, write_chroms_with_zooms, write_info, ChromData, ChromData2, ChromProcess, - ChromProcessedData, ChromProcessingInputSectionChannel, Section, TempZoomInfo, ZoomInfo, - ZoomValue, + ChromProcessedData, ChromProcessingInputSectionChannel, InternalProcessData, Section, + TempZoomInfo, ZoomInfo, ZoomValue, }; use crate::bbi::{Summary, Value, ZoomRecord, BIGWIG_MAGIC}; @@ -187,7 +187,7 @@ impl BigWigWrite { (zooms_channels, ftx) }; - let mut do_read = |chrom: String, data: _| -> Result<_, ProcessChromError<_>> { + let mut do_read = |chrom: String| -> Result<_, ProcessChromError<_>> { let length = match chrom_sizes.get(&chrom) { Some(length) => *length, None => { @@ -202,20 +202,16 @@ impl BigWigWrite { let (zooms_channels, ftx) = setup_chrom(); - let fut = BigWigWrite::process_chrom( + let internal_data = crate::InternalProcessData( zooms_channels, ftx, chrom_id, options, - handle.clone(), - data, + runtime.handle().clone(), chrom, length, ); - - let fut = fut.map(|f| f.map(|s| ChromProcessedData(s))); - - Ok(fut) + Ok(internal_data) }; let mut advance = |data: ChromProcessedData| { @@ -233,7 +229,7 @@ impl BigWigWrite { } }; - vals.process_to_bbi(handle, &mut do_read, &mut advance)?; + vals.process_to_bbi::(handle, &mut do_read, &mut advance)?; drop(send); @@ -320,7 +316,7 @@ impl BigWigWrite { (zooms_channels, ftx) }; - let mut do_read = |chrom: String, data: _| -> Result<_, ProcessChromError<_>> { + let mut do_read = |chrom: String| -> Result<_, ProcessChromError<_>> { let length = match chrom_sizes.get(&chrom) { Some(length) => *length, None => { @@ -335,20 +331,16 @@ impl BigWigWrite { let (zooms_channels, ftx) = setup_chrom(); - let fut = 
BigWigWrite::process_chrom( + let internal_data = crate::InternalProcessData( zooms_channels, ftx, chrom_id, options, - handle.clone(), - data, + runtime.handle().clone(), chrom, length, ); - - let fut = fut.map(|f| f.map(|s| ChromProcessedData(s))); - - Ok(fut) + Ok(internal_data) }; let mut advance = |data: ChromProcessedData| { @@ -366,7 +358,7 @@ impl BigWigWrite { } }; - vals.process_to_bbi(handle, &mut do_read, &mut advance)?; + vals.process_to_bbi::(handle, &mut do_read, &mut advance)?; drop(send); @@ -914,26 +906,30 @@ pub(crate) struct BigWigFullProcess; impl ChromProcess for BigWigFullProcess { type Value = Value; async fn do_process>( - zooms_channels: Vec<(u32, ChromProcessingInputSectionChannel)>, - ftx: ChromProcessingInputSectionChannel, - chrom_id: u32, - options: BBIWriteOptions, - runtime: Handle, - data: Values, - chrom: String, - length: u32, - ) -> Result> { - BigWigWrite::process_chrom( + InternalProcessData( zooms_channels, ftx, chrom_id, options, runtime, - data, chrom, length, - ) - .await + ): InternalProcessData, + data: Values, + ) -> Result> { + Ok(ChromProcessedData( + BigWigWrite::process_chrom( + zooms_channels, + ftx, + chrom_id, + options, + runtime, + data, + chrom, + length, + ) + .await?, + )) } } diff --git a/bigtools/src/utils/cli/bigwigmerge.rs b/bigtools/src/utils/cli/bigwigmerge.rs index 02fc8fd..ccaa0cc 100644 --- a/bigtools/src/utils/cli/bigwigmerge.rs +++ b/bigtools/src/utils/cli/bigwigmerge.rs @@ -1,7 +1,6 @@ use std::collections::{BTreeMap, HashMap}; use std::error::Error; use std::fs::File; -use std::future::Future; use std::io::{self, BufRead, BufReader}; use clap::Parser; @@ -11,7 +10,7 @@ use thiserror::Error; use crate::utils::chromvalues::ChromValues; use crate::utils::merge::merge_sections_many; use crate::utils::reopen::ReopenableFile; -use crate::{BBIReadError, BigWigRead, BigWigWrite}; +use crate::{BBIReadError, BigWigRead, BigWigWrite, ChromProcess, InternalProcessData}; use crate::{ChromData, 
ChromDataState, ChromProcessingKey, ProcessChromError}; use crate::{ChromData2, Value}; use tokio::runtime; @@ -395,16 +394,13 @@ impl ChromData for ChromGroupReadImpl { impl ChromData2 for ChromGroupReadImpl { type Values = MergingValues; fn process_to_bbi< - Fut: Future< - Output = Result< - crate::ChromProcessedData, - ProcessChromError<::Error>, - >, - >, + P: ChromProcess::Value>, StartProcessing: FnMut( String, - Self::Values, - ) -> Result::Error>>, + ) -> Result< + InternalProcessData, + ProcessChromError<::Error>, + >, Advance: FnMut(crate::ChromProcessedData), >( &mut self, @@ -417,7 +413,8 @@ impl ChromData2 for ChromGroupReadImpl { self.iter.next(); match next { Some(Ok((chrom, _, group))) => { - let read = start_processing(chrom, group)?; + let internal_data = start_processing(chrom)?; + let read = P::do_process(internal_data, group); let data = runtime.block_on(read)?; advance(data); } From e4126d80165de7df7b802e883590364f5d129c96 Mon Sep 17 00:00:00 2001 From: Jack Huey <31162821+jackh726@users.noreply.github.com> Date: Thu, 18 Apr 2024 23:48:12 -0400 Subject: [PATCH 05/31] Return instance of trait --- bigtools/src/bbi/bbiwrite.rs | 14 +++++--------- bigtools/src/bbi/bedchromdata.rs | 25 +++++++------------------ bigtools/src/bbi/bigbedwrite.rs | 20 ++++++++------------ bigtools/src/bbi/bigwigwrite.rs | 26 ++++++++++++-------------- bigtools/src/utils/cli/bigwigmerge.rs | 13 ++++--------- 5 files changed, 36 insertions(+), 62 deletions(-) diff --git a/bigtools/src/bbi/bbiwrite.rs b/bigtools/src/bbi/bbiwrite.rs index db9984c..7be81cd 100644 --- a/bigtools/src/bbi/bbiwrite.rs +++ b/bigtools/src/bbi/bbiwrite.rs @@ -587,12 +587,7 @@ pub trait ChromData2: Sized { fn process_to_bbi< P: ChromProcess::Value>, - StartProcessing: FnMut( - String, - ) -> Result< - InternalProcessData, - ProcessChromError<::Error>, - >, + StartProcessing: FnMut(String) -> Result::Error>>, Advance: FnMut(ChromProcessedData), >( &mut self, @@ -728,8 +723,9 @@ pub struct 
InternalProcessData( pub(crate) trait ChromProcess { type Value; + fn create(internal_data: InternalProcessData) -> Self; async fn do_process>( - internal_data: InternalProcessData, + self, data: Values, ) -> Result>; } @@ -742,7 +738,6 @@ pub(crate) fn write_vals< mut vals_iter: V, file: BufWriter, options: BBIWriteOptions, - process_chrom: P, runtime: Runtime, chrom_sizes: HashMap, ) -> Result< @@ -837,7 +832,8 @@ pub(crate) fn write_vals< chrom, length, ); - let fut = P::do_process(internal_data, data); + let mut p = P::create(internal_data); + let fut = p.do_process(data); let curr_key = key; key += 1; diff --git a/bigtools/src/bbi/bedchromdata.rs b/bigtools/src/bbi/bedchromdata.rs index caa1c98..7be771f 100644 --- a/bigtools/src/bbi/bedchromdata.rs +++ b/bigtools/src/bbi/bedchromdata.rs @@ -20,8 +20,7 @@ use crate::bed::bedparser::{ use crate::utils::chromvalues::ChromValues; use crate::utils::streaming_linereader::StreamingLineReader; use crate::{ - ChromData, ChromData2, ChromDataState, ChromProcess, ChromProcessingKey, InternalProcessData, - ProcessChromError, + ChromData, ChromData2, ChromDataState, ChromProcess, ChromProcessingKey, ProcessChromError, }; pub struct BedParserStreamingIterator { @@ -82,12 +81,7 @@ impl ChromData2 for BedParserStreamingIterator { fn process_to_bbi< P: ChromProcess::Value>, - StartProcessing: FnMut( - String, - ) -> Result< - InternalProcessData, - ProcessChromError<::Error>, - >, + StartProcessing: FnMut(String) -> Result::Error>>, Advance: FnMut(crate::ChromProcessedData), >( &mut self, @@ -107,8 +101,8 @@ impl ChromData2 for BedParserStreamingIterator { } } - let internal_data = start_processing(chrom)?; - let read = P::do_process(internal_data, group); + let p = start_processing(chrom)?; + let read = p.do_process(group); let data = runtime.block_on(read)?; advance(data); } @@ -233,12 +227,7 @@ impl ChromData2 fn process_to_bbi< P: ChromProcess::Value>, - StartProcessing: FnMut( - String, - ) -> Result< - 
InternalProcessData, - ProcessChromError<::Error>, - >, + StartProcessing: FnMut(String) -> Result::Error>>, Advance: FnMut(crate::ChromProcessedData), >( &mut self, @@ -278,8 +267,8 @@ impl ChromData2 } } - let internal_data = start_processing(chrom)?; - let read = P::do_process(internal_data, group); + let p = start_processing(chrom)?; + let read = p.do_process(group); let data = runtime.spawn(read); queued_reads.push_back(data); } diff --git a/bigtools/src/bbi/bigbedwrite.rs b/bigtools/src/bbi/bigbedwrite.rs index 3fda89d..f7cc83a 100644 --- a/bigtools/src/bbi/bigbedwrite.rs +++ b/bigtools/src/bbi/bigbedwrite.rs @@ -153,11 +153,10 @@ impl BigBedWrite { let (autosql_offset, total_summary_offset, full_data_offset, pre_data) = BigBedWrite::write_pre(&mut file, &self.autosql)?; - let output = bbiwrite::write_vals( + let output = bbiwrite::write_vals::<_, _, BigBedFullProcess>( vals, file, self.options, - BigBedFullProcess, runtime, chrom_sizes.clone(), ); @@ -861,22 +860,19 @@ struct EntriesSection { overlap: IndexList, } -pub(crate) struct BigBedFullProcess; +pub(crate) struct BigBedFullProcess(InternalProcessData); impl ChromProcess for BigBedFullProcess { type Value = BedEntry; + fn create(internal_data: InternalProcessData) -> Self { + BigBedFullProcess(internal_data) + } async fn do_process>( - InternalProcessData( - zooms_channels, - ftx, - chrom_id, - options, - runtime, - chrom, - length, - ): InternalProcessData, + self, data: Values, ) -> Result> { + let InternalProcessData(zooms_channels, ftx, chrom_id, options, runtime, chrom, length) = + self.0; Ok(ChromProcessedData( BigBedWrite::process_chrom( zooms_channels, diff --git a/bigtools/src/bbi/bigwigwrite.rs b/bigtools/src/bbi/bigwigwrite.rs index 283ee3c..c488090 100644 --- a/bigtools/src/bbi/bigwigwrite.rs +++ b/bigtools/src/bbi/bigwigwrite.rs @@ -211,7 +211,7 @@ impl BigWigWrite { chrom, length, ); - Ok(internal_data) + Ok(BigWigFullProcess::create(internal_data)) }; let mut advance = |data: 
ChromProcessedData| { @@ -229,7 +229,7 @@ impl BigWigWrite { } }; - vals.process_to_bbi::(handle, &mut do_read, &mut advance)?; + vals.process_to_bbi(handle, &mut do_read, &mut advance)?; drop(send); @@ -340,7 +340,7 @@ impl BigWigWrite { chrom, length, ); - Ok(internal_data) + Ok(BigWigFullProcess::create(internal_data)) }; let mut advance = |data: ChromProcessedData| { @@ -358,7 +358,7 @@ impl BigWigWrite { } }; - vals.process_to_bbi::(handle, &mut do_read, &mut advance)?; + vals.process_to_bbi(handle, &mut do_read, &mut advance)?; drop(send); @@ -901,22 +901,20 @@ impl BigWigWrite { } } -pub(crate) struct BigWigFullProcess; +pub(crate) struct BigWigFullProcess(InternalProcessData); impl ChromProcess for BigWigFullProcess { type Value = Value; + fn create(internal_data: InternalProcessData) -> Self { + BigWigFullProcess(internal_data) + } + async fn do_process>( - InternalProcessData( - zooms_channels, - ftx, - chrom_id, - options, - runtime, - chrom, - length, - ): InternalProcessData, + self, data: Values, ) -> Result> { + let InternalProcessData(zooms_channels, ftx, chrom_id, options, runtime, chrom, length) = + self.0; Ok(ChromProcessedData( BigWigWrite::process_chrom( zooms_channels, diff --git a/bigtools/src/utils/cli/bigwigmerge.rs b/bigtools/src/utils/cli/bigwigmerge.rs index ccaa0cc..5dcb1ef 100644 --- a/bigtools/src/utils/cli/bigwigmerge.rs +++ b/bigtools/src/utils/cli/bigwigmerge.rs @@ -10,7 +10,7 @@ use thiserror::Error; use crate::utils::chromvalues::ChromValues; use crate::utils::merge::merge_sections_many; use crate::utils::reopen::ReopenableFile; -use crate::{BBIReadError, BigWigRead, BigWigWrite, ChromProcess, InternalProcessData}; +use crate::{BBIReadError, BigWigRead, BigWigWrite, ChromProcess}; use crate::{ChromData, ChromDataState, ChromProcessingKey, ProcessChromError}; use crate::{ChromData2, Value}; use tokio::runtime; @@ -395,12 +395,7 @@ impl ChromData2 for ChromGroupReadImpl { type Values = MergingValues; fn process_to_bbi< P: 
ChromProcess::Value>, - StartProcessing: FnMut( - String, - ) -> Result< - InternalProcessData, - ProcessChromError<::Error>, - >, + StartProcessing: FnMut(String) -> Result::Error>>, Advance: FnMut(crate::ChromProcessedData), >( &mut self, @@ -413,8 +408,8 @@ impl ChromData2 for ChromGroupReadImpl { self.iter.next(); match next { Some(Ok((chrom, _, group))) => { - let internal_data = start_processing(chrom)?; - let read = P::do_process(internal_data, group); + let p = start_processing(chrom)?; + let read = p.do_process(group); let data = runtime.block_on(read)?; advance(data); } From ba99b5b772d15f778d02005f1a8caf817c7587c1 Mon Sep 17 00:00:00 2001 From: Jack Huey <31162821+jackh726@users.noreply.github.com> Date: Fri, 19 Apr 2024 00:11:47 -0400 Subject: [PATCH 06/31] Inline process_chrom fns and move beginnings to create fn --- bigtools/src/bbi/bigbedwrite.rs | 466 +++++++++++--------------- bigtools/src/bbi/bigwigwrite.rs | 207 ++++++------ bigtools/src/utils/cli/bedtobigbed.rs | 2 +- 3 files changed, 318 insertions(+), 357 deletions(-) diff --git a/bigtools/src/bbi/bigbedwrite.rs b/bigtools/src/bbi/bigbedwrite.rs index f7cc83a..3a98ed7 100644 --- a/bigtools/src/bbi/bigbedwrite.rs +++ b/bigtools/src/bbi/bigbedwrite.rs @@ -6,7 +6,6 @@ use std::io::{self, BufWriter, Write}; use futures::future::FutureExt; use futures::sink::SinkExt; -use futures::Future; use byteorder::{NativeEndian, WriteBytesExt}; use tokio::runtime::{Handle, Runtime}; @@ -81,71 +80,11 @@ impl BigBedWrite { } /// Write the values from `V` as a bigWig. Will utilize the provided runtime for encoding values and for reading through the values (potentially parallelized by chromosome). 
- pub fn write< - Values: ChromValues + Send + 'static, - V: ChromData, - >( + pub fn write, V: ChromData>( self, chrom_sizes: HashMap, vals: V, runtime: Runtime, - ) -> Result<(), ProcessChromError> { - let process_chrom = |zooms_channels: Vec<(u32, ChromProcessingInputSectionChannel)>, - ftx: ChromProcessingInputSectionChannel, - chrom_id: u32, - options: BBIWriteOptions, - runtime: Handle, - chrom_values: Values, - chrom: String, - chrom_length: u32| { - let fut = BigBedWrite::process_chrom( - zooms_channels, - ftx, - chrom_id, - options, - runtime.clone(), - chrom_values, - chrom, - chrom_length, - ); - runtime.spawn(fut).map(|f| f.unwrap()) - }; - self.write_internal(chrom_sizes, vals, runtime, process_chrom) - } - - /// Write the values from `V` as a bigWig. Will utilize the provided runtime for encoding values, but will read through values on the current thread. - pub fn write_singlethreaded< - Values: ChromValues, - V: ChromData, - >( - self, - chrom_sizes: HashMap, - vals: V, - runtime: Runtime, - ) -> Result<(), ProcessChromError> { - self.write_internal(chrom_sizes, vals, runtime, BigBedWrite::process_chrom) - } - - fn write_internal< - Values: ChromValues, - V: ChromData, - Fut: Future>>, - G: Fn( - Vec<(u32, ChromProcessingInputSectionChannel)>, - ChromProcessingInputSectionChannel, - u32, - BBIWriteOptions, - Handle, - Values, - String, - u32, - ) -> Fut, - >( - self, - chrom_sizes: HashMap, - vals: V, - runtime: Runtime, - process_chrom: G, ) -> Result<(), ProcessChromError> { let fp = File::create(self.path.clone())?; let mut file = BufWriter::new(fp); @@ -227,7 +166,7 @@ impl BigBedWrite { chrom_values: Values, chrom: String, chrom_length: u32| { - let fut = BigBedWrite::process_chrom_no_zooms( + let fut = process_chrom_no_zooms( ftx, chrom_id, options, @@ -267,7 +206,7 @@ impl BigBedWrite { let output = bbiwrite::write_zoom_vals( vals, self.options, - BigBedWrite::process_chrom_zoom, + process_chrom_zoom, &runtime, &chrom_ids, 
(summary.bases_covered as f64 / summary.total_items as f64) as u32, @@ -306,7 +245,8 @@ impl BigBedWrite { chrom: &String, chrom_values: &mut I, summary: &mut Option, - state_val: &mut EntriesSection, + items: &mut Vec, + overlap: &mut IndexList, options: BBIWriteOptions, runtime: &Handle, ftx: &mut ChromProcessingInputSectionChannel, @@ -439,7 +379,7 @@ impl BigBedWrite { }; add_interval_to_summary( - &mut state_val.overlap, + overlap, summary, current_val.start, current_val.end, @@ -447,13 +387,10 @@ impl BigBedWrite { ); // Then, add the current item to the actual values, and encode if full, or last item - state_val.items.push(current_val); - if chrom_values.peek().is_none() || state_val.items.len() >= options.items_per_slot as usize - { - let items = std::mem::replace( - &mut state_val.items, - Vec::with_capacity(options.items_per_slot as usize), - ); + items.push(current_val); + if chrom_values.peek().is_none() || items.len() >= options.items_per_slot as usize { + let items = + std::mem::replace(items, Vec::with_capacity(options.items_per_slot as usize)); let handle = runtime .spawn(encode_section(options.compress, items, chrom_id)) .map(|f| f.unwrap()); @@ -626,26 +563,42 @@ impl BigBedWrite { Ok(()) } +} +// While we do technically lose precision here by using the f32 in Value, we can reuse the same merge_into method +struct ZoomItem { + size: u32, + live_info: Option<(ZoomRecord, u64)>, + overlap: IndexList, + records: Vec, + channel: ChromProcessingInputSectionChannel, +} +struct EntriesSection { + items: Vec, + overlap: IndexList, + zoom_items: Vec, +} + +pub(crate) struct BigBedFullProcess { + summary: Option, + state_val: EntriesSection, + total_items: u64, + + ftx: ChromProcessingInputSectionChannel, + chrom_id: u32, + options: BBIWriteOptions, + runtime: Handle, + chrom: String, + length: u32, +} + +impl ChromProcess for BigBedFullProcess { + type Value = BedEntry; + fn create(internal_data: InternalProcessData) -> Self { + let 
InternalProcessData(zooms_channels, ftx, chrom_id, options, runtime, chrom, length) = + internal_data; - async fn process_chrom( - zooms_channels: Vec<(u32, ChromProcessingInputSectionChannel)>, - mut ftx: ChromProcessingInputSectionChannel, - chrom_id: u32, - options: BBIWriteOptions, - runtime: Handle, - mut chrom_values: I, - chrom: String, - chrom_length: u32, - ) -> Result> - where - I: ChromValues, - { let mut summary: Option = None; - let mut state_val = EntriesSection { - items: Vec::with_capacity(options.items_per_slot as usize), - overlap: IndexList::new(), - }; let mut zoom_items = zooms_channels .into_iter() .map(|(size, channel)| ZoomItem { @@ -656,8 +609,41 @@ impl BigBedWrite { channel, }) .collect(); + let mut state_val = EntriesSection { + zoom_items, + items: Vec::with_capacity(options.items_per_slot as usize), + overlap: IndexList::new(), + }; let mut total_items = 0; - while let Some(current_val) = chrom_values.next() { + BigBedFullProcess { + summary, + state_val, + total_items, + ftx, + chrom_id, + options, + runtime, + chrom, + length, + } + } + async fn do_process>( + self, + mut data: Values, + ) -> Result> { + let Self { + mut summary, + mut total_items, + mut state_val, + mut ftx, + chrom_id, + options, + runtime, + chrom, + length, + } = self; + + while let Some(current_val) = data.next() { // If there is a source error, propogate that up let current_val = current_val.map_err(ProcessChromError::SourceError)?; total_items += 1; @@ -667,11 +653,12 @@ impl BigBedWrite { BigBedWrite::process_val( current_val, - chrom_length, + length, &chrom, - &mut chrom_values, + &mut data, &mut summary, - &mut state_val, + &mut state_val.items, + &mut state_val.overlap, options, &runtime, &mut ftx, @@ -680,11 +667,11 @@ impl BigBedWrite { .await?; BigBedWrite::process_val_zoom( - &mut zoom_items, + &mut state_val.zoom_items, options, item_start, item_end, - &mut chrom_values, + &mut data, &runtime, chrom_id, ) @@ -692,7 +679,7 @@ impl BigBedWrite { } 
debug_assert!(state_val.items.is_empty()); - for zoom_item in zoom_items.iter_mut() { + for zoom_item in state_val.zoom_items.iter_mut() { debug_assert!(zoom_item.live_info.is_none()); debug_assert!(zoom_item.records.is_empty()); } @@ -709,184 +696,141 @@ impl BigBedWrite { Some(summary) => summary, }; summary_complete.total_items = total_items; - Ok(summary_complete) + Ok(ChromProcessedData(summary_complete)) } +} - pub(crate) async fn process_chrom_no_zooms>( - mut ftx: ChromProcessingInputSectionChannel, - chrom_id: u32, - options: BBIWriteOptions, - runtime: Handle, - mut chrom_values: I, - chrom: String, - chrom_length: u32, - ) -> Result<(Summary, Vec<(u64, u64)>), ProcessChromError> { - #[derive(Debug, Copy, Clone)] - struct ZoomCounts { - resolution: u64, - current_end: u64, - counts: u64, - } - - let mut summary: Option = None; - - let mut state_val = EntriesSection { - items: Vec::with_capacity(options.items_per_slot as usize), - overlap: IndexList::new(), - }; - let mut zoom_counts: Vec = std::iter::successors(Some(10), |z| Some(z * 4)) - .take_while(|z| *z <= u64::MAX / 4 && *z <= chrom_length as u64 * 4) - .map(|z| ZoomCounts { - resolution: z, - current_end: 0, - counts: 0, - }) - .collect(); - - let mut total_items = 0; - while let Some(current_val) = chrom_values.next() { - // If there is a source error, propogate that up - let current_val = current_val.map_err(ProcessChromError::SourceError)?; - total_items += 1; - - let item_start = current_val.start; - let item_end = current_val.end; - - BigBedWrite::process_val( - current_val, - chrom_length, - &chrom, - &mut chrom_values, - &mut summary, - &mut state_val, - options, - &runtime, - &mut ftx, - chrom_id, - ) - .await?; +pub(crate) async fn process_chrom_no_zooms>( + mut ftx: ChromProcessingInputSectionChannel, + chrom_id: u32, + options: BBIWriteOptions, + runtime: Handle, + mut chrom_values: I, + chrom: String, + chrom_length: u32, +) -> Result<(Summary, Vec<(u64, u64)>), ProcessChromError> { + 
#[derive(Debug, Copy, Clone)] + struct ZoomCounts { + resolution: u64, + current_end: u64, + counts: u64, + } - for zoom in &mut zoom_counts { - if item_start as u64 >= zoom.current_end { - zoom.counts += 1; - zoom.current_end = item_start as u64 + zoom.resolution; - } - while item_end as u64 > zoom.current_end { - zoom.counts += 1; - zoom.current_end += zoom.resolution; - } + let mut summary: Option = None; + + let mut items = Vec::with_capacity(options.items_per_slot as usize); + let mut overlap = IndexList::new(); + let mut zoom_counts: Vec = std::iter::successors(Some(10), |z| Some(z * 4)) + .take_while(|z| *z <= u64::MAX / 4 && *z <= chrom_length as u64 * 4) + .map(|z| ZoomCounts { + resolution: z, + current_end: 0, + counts: 0, + }) + .collect(); + + let mut total_items = 0; + while let Some(current_val) = chrom_values.next() { + // If there is a source error, propogate that up + let current_val = current_val.map_err(ProcessChromError::SourceError)?; + total_items += 1; + + let item_start = current_val.start; + let item_end = current_val.end; + + BigBedWrite::process_val( + current_val, + chrom_length, + &chrom, + &mut chrom_values, + &mut summary, + &mut items, + &mut overlap, + options, + &runtime, + &mut ftx, + chrom_id, + ) + .await?; + + for zoom in &mut zoom_counts { + if item_start as u64 >= zoom.current_end { + zoom.counts += 1; + zoom.current_end = item_start as u64 + zoom.resolution; + } + while item_end as u64 > zoom.current_end { + zoom.counts += 1; + zoom.current_end += zoom.resolution; } } - - debug_assert!(state_val.items.is_empty()); - - let mut summary_complete = match summary { - None => Summary { - total_items: 0, - bases_covered: 0, - min_val: 0.0, - max_val: 0.0, - sum: 0.0, - sum_squares: 0.0, - }, - Some(summary) => summary, - }; - summary_complete.total_items = total_items; - - let zoom_counts = zoom_counts - .into_iter() - .map(|z| (z.resolution, z.counts)) - .collect(); - - Ok((summary_complete, zoom_counts)) } - pub(crate) async fn 
process_chrom_zoom>( - zooms_channels: Vec<(u32, ChromProcessingInputSectionChannel)>, - chrom_id: u32, - options: BBIWriteOptions, - runtime: Handle, - mut chrom_values: I, - ) -> Result<(), ProcessChromError> { - let mut zoom_items: Vec = zooms_channels - .into_iter() - .map(|(size, channel)| ZoomItem { - size, - live_info: None, - overlap: IndexList::new(), - records: Vec::with_capacity(options.items_per_slot as usize), - channel, - }) - .collect(); - - while let Some(current_val) = chrom_values.next() { - // If there is a source error, propogate that up - let current_val = current_val.map_err(ProcessChromError::SourceError)?; + debug_assert!(items.is_empty()); - let item_start = current_val.start; - let item_end = current_val.end; - - BigBedWrite::process_val_zoom( - &mut zoom_items, - options, - item_start, - item_end, - &mut chrom_values, - &runtime, - chrom_id, - ) - .await?; - } - - for zoom_item in zoom_items.iter_mut() { - debug_assert!(zoom_item.live_info.is_none()); - debug_assert!(zoom_item.records.is_empty()); - } + let mut summary_complete = match summary { + None => Summary { + total_items: 0, + bases_covered: 0, + min_val: 0.0, + max_val: 0.0, + sum: 0.0, + sum_squares: 0.0, + }, + Some(summary) => summary, + }; + summary_complete.total_items = total_items; - Ok(()) - } -} + let zoom_counts = zoom_counts + .into_iter() + .map(|z| (z.resolution, z.counts)) + .collect(); -// While we do technically lose precision here by using the f32 in Value, we can reuse the same merge_into method -struct ZoomItem { - size: u32, - live_info: Option<(ZoomRecord, u64)>, - overlap: IndexList, - records: Vec, - channel: ChromProcessingInputSectionChannel, + Ok((summary_complete, zoom_counts)) } -struct EntriesSection { - items: Vec, - overlap: IndexList, -} - -pub(crate) struct BigBedFullProcess(InternalProcessData); -impl ChromProcess for BigBedFullProcess { - type Value = BedEntry; - fn create(internal_data: InternalProcessData) -> Self { - 
BigBedFullProcess(internal_data) +pub(crate) async fn process_chrom_zoom>( + zooms_channels: Vec<(u32, ChromProcessingInputSectionChannel)>, + chrom_id: u32, + options: BBIWriteOptions, + runtime: Handle, + mut chrom_values: I, +) -> Result<(), ProcessChromError> { + let mut zoom_items: Vec = zooms_channels + .into_iter() + .map(|(size, channel)| ZoomItem { + size, + live_info: None, + overlap: IndexList::new(), + records: Vec::with_capacity(options.items_per_slot as usize), + channel, + }) + .collect(); + + while let Some(current_val) = chrom_values.next() { + // If there is a source error, propogate that up + let current_val = current_val.map_err(ProcessChromError::SourceError)?; + + let item_start = current_val.start; + let item_end = current_val.end; + + BigBedWrite::process_val_zoom( + &mut zoom_items, + options, + item_start, + item_end, + &mut chrom_values, + &runtime, + chrom_id, + ) + .await?; } - async fn do_process>( - self, - data: Values, - ) -> Result> { - let InternalProcessData(zooms_channels, ftx, chrom_id, options, runtime, chrom, length) = - self.0; - Ok(ChromProcessedData( - BigBedWrite::process_chrom( - zooms_channels, - ftx, - chrom_id, - options, - runtime, - data, - chrom, - length, - ) - .await?, - )) + + for zoom_item in zoom_items.iter_mut() { + debug_assert!(zoom_item.live_info.is_none()); + debug_assert!(zoom_item.records.is_empty()); } + + Ok(()) } async fn encode_section( diff --git a/bigtools/src/bbi/bigwigwrite.rs b/bigtools/src/bbi/bigwigwrite.rs index c488090..0ade459 100644 --- a/bigtools/src/bbi/bigwigwrite.rs +++ b/bigtools/src/bbi/bigwigwrite.rs @@ -554,18 +554,18 @@ impl BigWigWrite { Ok(()) } - async fn process_val>( + async fn process_val( current_val: Value, + next_val: Option, chrom_length: u32, chrom: &String, - chrom_values: &mut I, summary: &mut Summary, items: &mut Vec, options: BBIWriteOptions, runtime: &Handle, ftx: &mut ChromProcessingInputSectionChannel, chrom_id: u32, - ) -> Result<(), ProcessChromError> { + ) -> 
Result<(), ProcessChromError> { // Check a few preconditions: // - The current end is greater than or equal to the start // - The current end is at most the chromosome length @@ -583,9 +583,9 @@ impl BigWigWrite { current_val.end, chrom, chrom_length ))); } - match chrom_values.peek() { - None | Some(Err(_)) => (), - Some(Ok(next_val)) => { + match next_val { + None => {} + Some(next_val) => { if current_val.end > next_val.start { return Err(ProcessChromError::InvalidInput(format!( "Invalid bed graph: overlapping values on chromosome {} at {}-{} and {}-{}", @@ -609,7 +609,7 @@ impl BigWigWrite { // Then, add the current item to the actual values, and encode if full, or last item items.push(current_val); - if chrom_values.peek().is_none() || items.len() >= options.items_per_slot as usize { + if next_val.is_none() || items.len() >= options.items_per_slot as usize { let items = std::mem::take(items); let handle = runtime .spawn(encode_section(options.compress, items, chrom_id)) @@ -708,78 +708,6 @@ impl BigWigWrite { Ok(()) } - pub(crate) async fn process_chrom>( - zooms_channels: Vec<(u32, ChromProcessingInputSectionChannel)>, - mut ftx: ChromProcessingInputSectionChannel, - chrom_id: u32, - options: BBIWriteOptions, - runtime: Handle, - mut chrom_values: I, - chrom: String, - chrom_length: u32, - ) -> Result> { - let mut summary = Summary { - total_items: 0, - bases_covered: 0, - min_val: f64::MAX, - max_val: f64::MIN, - sum: 0.0, - sum_squares: 0.0, - }; - - let mut items = Vec::with_capacity(options.items_per_slot as usize); - let mut zoom_items: Vec = zooms_channels - .into_iter() - .map(|(size, channel)| ZoomItem { - size, - live_info: None, - records: Vec::with_capacity(options.items_per_slot as usize), - channel, - }) - .collect(); - - while let Some(current_val) = chrom_values.next() { - // If there is a source error, propogate that up - let current_val = current_val.map_err(ProcessChromError::SourceError)?; - - BigWigWrite::process_val( - current_val, - 
chrom_length, - &chrom, - &mut chrom_values, - &mut summary, - &mut items, - options, - &runtime, - &mut ftx, - chrom_id, - ) - .await?; - - BigWigWrite::process_val_zoom( - &mut zoom_items, - options, - current_val, - &mut chrom_values, - &runtime, - chrom_id, - ) - .await?; - } - - debug_assert!(items.is_empty()); - for zoom_item in zoom_items.iter_mut() { - debug_assert!(zoom_item.live_info.is_none()); - debug_assert!(zoom_item.records.is_empty()); - } - - if summary.total_items == 0 { - summary.min_val = 0.0; - summary.max_val = 0.0; - } - Ok(summary) - } - pub(crate) async fn process_chrom_no_zooms>( mut ftx: ChromProcessingInputSectionChannel, chrom_id: u32, @@ -818,12 +746,16 @@ impl BigWigWrite { while let Some(current_val) = chrom_values.next() { // If there is a source error, propogate that up let current_val = current_val.map_err(ProcessChromError::SourceError)?; + let next_val = match chrom_values.peek() { + None | Some(Err(_)) => None, + Some(Ok(v)) => Some(*v), + }; BigWigWrite::process_val( current_val, + next_val, chrom_length, &chrom, - &mut chrom_values, &mut summary, &mut items, options, @@ -901,33 +833,118 @@ impl BigWigWrite { } } -pub(crate) struct BigWigFullProcess(InternalProcessData); +pub(crate) struct BigWigFullProcess { + summary: Summary, + items: Vec, + zoom_items: Vec, + + ftx: ChromProcessingInputSectionChannel, + chrom_id: u32, + options: BBIWriteOptions, + runtime: Handle, + chrom: String, + length: u32, +} impl ChromProcess for BigWigFullProcess { type Value = Value; fn create(internal_data: InternalProcessData) -> Self { - BigWigFullProcess(internal_data) + let InternalProcessData(zooms_channels, mut ftx, chrom_id, options, runtime, chrom, length) = + internal_data; + + let mut summary = Summary { + total_items: 0, + bases_covered: 0, + min_val: f64::MAX, + max_val: f64::MIN, + sum: 0.0, + sum_squares: 0.0, + }; + + let mut items = Vec::with_capacity(options.items_per_slot as usize); + let mut zoom_items: Vec = zooms_channels + 
.into_iter() + .map(|(size, channel)| ZoomItem { + size, + live_info: None, + records: Vec::with_capacity(options.items_per_slot as usize), + channel, + }) + .collect(); + + BigWigFullProcess { + summary, + items, + zoom_items, + ftx, + chrom_id, + options, + runtime, + chrom, + length, + } } async fn do_process>( self, - data: Values, + mut data: Values, ) -> Result> { - let InternalProcessData(zooms_channels, ftx, chrom_id, options, runtime, chrom, length) = - self.0; - Ok(ChromProcessedData( - BigWigWrite::process_chrom( - zooms_channels, - ftx, + let Self { + mut summary, + mut items, + mut zoom_items, + mut ftx, + chrom_id, + options, + runtime, + chrom, + length, + } = self; + + while let Some(current_val) = data.next() { + // If there is a source error, propogate that up + let current_val = current_val.map_err(ProcessChromError::SourceError)?; + let next_val = match data.peek() { + None | Some(Err(_)) => None, + Some(Ok(v)) => Some(*v), + }; + + BigWigWrite::process_val( + current_val, + next_val, + length, + &chrom, + &mut summary, + &mut items, + options, + &runtime, + &mut ftx, chrom_id, + ) + .await?; + + BigWigWrite::process_val_zoom( + &mut zoom_items, options, - runtime, - data, - chrom, - length, + current_val, + &mut data, + &runtime, + chrom_id, ) - .await?, - )) + .await?; + } + + debug_assert!(items.is_empty()); + for zoom_item in zoom_items.iter_mut() { + debug_assert!(zoom_item.live_info.is_none()); + debug_assert!(zoom_item.records.is_empty()); + } + + if summary.total_items == 0 { + summary.min_val = 0.0; + summary.max_val = 0.0; + } + Ok(ChromProcessedData(summary)) } } diff --git a/bigtools/src/utils/cli/bedtobigbed.rs b/bigtools/src/utils/cli/bedtobigbed.rs index 443cbea..8e3b10f 100644 --- a/bigtools/src/utils/cli/bedtobigbed.rs +++ b/bigtools/src/utils/cli/bedtobigbed.rs @@ -124,7 +124,7 @@ pub fn bedtobigbed(args: BedToBigBedArgs) -> Result<(), Box> { let vals_iter = BedParser::from_bed_file(stdin); let chsi = 
BedParserStreamingIterator::new(vals_iter, allow_out_of_order_chroms); - outb.write_singlethreaded(chrom_map, chsi, runtime)?; + outb.write(chrom_map, chsi, runtime)?; } else { let infile = File::open(&bedpath)?; let (parallel, parallel_required) = match (nthreads, args.parallel.as_ref()) { From 392404eafff69e84c1c3287c6788f86f91d26b98 Mon Sep 17 00:00:00 2001 From: Jack Huey <31162821+jackh726@users.noreply.github.com> Date: Fri, 19 Apr 2024 00:35:40 -0400 Subject: [PATCH 07/31] Pass back P and destroy to get data --- bigtools/src/bbi/bbiwrite.rs | 74 ++++++++++------------- bigtools/src/bbi/bedchromdata.rs | 28 +++++---- bigtools/src/bbi/bigbedwrite.rs | 86 +++++++++++++++------------ bigtools/src/bbi/bigwigwrite.rs | 69 ++++++++++++--------- bigtools/src/utils/cli/bigwigmerge.rs | 8 +-- 5 files changed, 141 insertions(+), 124 deletions(-) diff --git a/bigtools/src/bbi/bbiwrite.rs b/bigtools/src/bbi/bbiwrite.rs index 7be81cd..8ceb1a6 100644 --- a/bigtools/src/bbi/bbiwrite.rs +++ b/bigtools/src/bbi/bbiwrite.rs @@ -586,9 +586,9 @@ pub trait ChromData2: Sized { type Values: ChromValues; fn process_to_bbi< - P: ChromProcess::Value>, + P: ChromProcess::Value> + Send + 'static, StartProcessing: FnMut(String) -> Result::Error>>, - Advance: FnMut(ChromProcessedData), + Advance: FnMut(P), >( &mut self, runtime: &Handle, @@ -724,16 +724,17 @@ pub struct InternalProcessData( pub(crate) trait ChromProcess { type Value; fn create(internal_data: InternalProcessData) -> Self; + fn destroy(self) -> ChromProcessedData; async fn do_process>( - self, + &mut self, data: Values, - ) -> Result>; + ) -> Result<(), ProcessChromError>; } pub(crate) fn write_vals< Values: ChromValues, - V: ChromData, - P: ChromProcess, + V: ChromData2, + P: ChromProcess + Send + 'static, >( mut vals_iter: V, file: BufWriter, @@ -765,12 +766,13 @@ pub(crate) fn write_vals< let mut chrom_ids = IdMap::default(); - let mut key = 0; - let mut output: BTreeMap = BTreeMap::new(); - let mut summary: Option = 
None; let (send, recv) = futures_mpsc::unbounded(); let write_fut = write_chroms_with_zooms(file, zooms_map, recv); + let (write_fut, write_fut_handle) = write_fut.remote_handle(); + runtime.spawn(write_fut); + + let handle = runtime.handle(); let setup_chrom = || { let (ftx, sections_handle, buf, section_receiver) = @@ -805,10 +807,7 @@ pub(crate) fn write_vals< (zooms_channels, ftx) }; - let mut do_read = |chrom: String, - data: _, - output: &mut BTreeMap| - -> Result> { + let mut do_read = |chrom: String| -> Result<_, ProcessChromError<_>> { let length = match chrom_sizes.get(&chrom) { Some(length) => *length, None => { @@ -823,7 +822,7 @@ pub(crate) fn write_vals< let (zooms_channels, ftx) = setup_chrom(); - let internal_data = InternalProcessData( + let internal_data = crate::InternalProcessData( zooms_channels, ftx, chrom_id, @@ -832,40 +831,27 @@ pub(crate) fn write_vals< chrom, length, ); - let mut p = P::create(internal_data); - let fut = p.do_process(data); - - let curr_key = key; - key += 1; - - output.insert(curr_key, fut); - - Ok(ChromProcessingKey(curr_key)) + Ok(P::create(internal_data)) }; - let (write_fut, write_fut_handle) = write_fut.remote_handle(); - runtime.spawn(write_fut); - loop { - match vals_iter.advance(&mut do_read, &mut output)? 
{ - ChromDataState::NewChrom(read) => { - let fut = output.remove(&read.0).unwrap(); - let chrom_summary = runtime.block_on(fut)?.0; - match &mut summary { - None => summary = Some(chrom_summary), - Some(summary) => { - summary.total_items += chrom_summary.total_items; - summary.bases_covered += chrom_summary.bases_covered; - summary.min_val = summary.min_val.min(chrom_summary.min_val); - summary.max_val = summary.max_val.max(chrom_summary.max_val); - summary.sum += chrom_summary.sum; - summary.sum_squares += chrom_summary.sum_squares; - } - } + let mut advance = |p: P| { + let data = p.destroy(); + let ChromProcessedData(chrom_summary) = data; + match &mut summary { + None => summary = Some(chrom_summary), + Some(summary) => { + summary.total_items += chrom_summary.total_items; + summary.bases_covered += chrom_summary.bases_covered; + summary.min_val = summary.min_val.min(chrom_summary.min_val); + summary.max_val = summary.max_val.max(chrom_summary.max_val); + summary.sum += chrom_summary.sum; + summary.sum_squares += chrom_summary.sum_squares; } - ChromDataState::Finished => break, - ChromDataState::Error(err) => return Err(ProcessChromError::SourceError(err)), } - } + }; + + vals_iter.process_to_bbi(handle, &mut do_read, &mut advance)?; + drop(send); let summary_complete = summary.unwrap_or(Summary { diff --git a/bigtools/src/bbi/bedchromdata.rs b/bigtools/src/bbi/bedchromdata.rs index 7be771f..1f917e6 100644 --- a/bigtools/src/bbi/bedchromdata.rs +++ b/bigtools/src/bbi/bedchromdata.rs @@ -82,7 +82,7 @@ impl ChromData2 for BedParserStreamingIterator { fn process_to_bbi< P: ChromProcess::Value>, StartProcessing: FnMut(String) -> Result::Error>>, - Advance: FnMut(crate::ChromProcessedData), + Advance: FnMut(P), >( &mut self, runtime: &Handle, @@ -101,10 +101,10 @@ impl ChromData2 for BedParserStreamingIterator { } } - let p = start_processing(chrom)?; + let mut p = start_processing(chrom)?; let read = p.do_process(group); - let data = runtime.block_on(read)?; - 
advance(data); + runtime.block_on(read)?; + advance(p); } Some(Err(e)) => return Err(ProcessChromError::SourceError(e)), None => break, @@ -220,15 +220,15 @@ impl ChromData } } -impl ChromData2 +impl ChromData2 for BedParserParallelStreamingIterator, BedValueError> { type Values = BedChromData>>; fn process_to_bbi< - P: ChromProcess::Value>, + P: ChromProcess::Value> + Send + 'static, StartProcessing: FnMut(String) -> Result::Error>>, - Advance: FnMut(crate::ChromProcessedData), + Advance: FnMut(P), >( &mut self, runtime: &Handle, @@ -267,9 +267,13 @@ impl ChromData2 } } - let p = start_processing(chrom)?; - let read = p.do_process(group); - let data = runtime.spawn(read); + let mut p = start_processing(chrom)?; + let data: tokio::task::JoinHandle< + Result>, + > = runtime.spawn(async move { + p.do_process(group).await?; + Ok(p) + }); queued_reads.push_back(data); } Some(Err(e)) => return Err(ProcessChromError::SourceError(e)), @@ -281,8 +285,8 @@ impl ChromData2 let Some(next_chrom) = queued_reads.pop_front() else { break; }; - let data = runtime.block_on(next_chrom).unwrap()?; - advance(data); + let p = runtime.block_on(next_chrom).unwrap()?; + advance(p); } Ok(()) diff --git a/bigtools/src/bbi/bigbedwrite.rs b/bigtools/src/bbi/bigbedwrite.rs index 3a98ed7..034b79b 100644 --- a/bigtools/src/bbi/bigbedwrite.rs +++ b/bigtools/src/bbi/bigbedwrite.rs @@ -14,8 +14,8 @@ use crate::utils::chromvalues::ChromValues; use crate::utils::indexlist::IndexList; use crate::utils::tell::Tell; use crate::{ - write_info, ChromData, ChromProcess, ChromProcessedData, ChromProcessingInputSectionChannel, - InternalProcessData, + write_info, ChromData, ChromData2, ChromProcess, ChromProcessedData, + ChromProcessingInputSectionChannel, InternalProcessData, }; use crate::bbi::{BedEntry, Summary, Value, ZoomRecord, BIGBED_MAGIC}; @@ -80,7 +80,7 @@ impl BigBedWrite { } /// Write the values from `V` as a bigWig. 
Will utilize the provided runtime for encoding values and for reading through the values (potentially parallelized by chromosome). - pub fn write, V: ChromData>( + pub fn write, V: ChromData2>( self, chrom_sizes: HashMap, vals: V, @@ -593,13 +593,41 @@ pub(crate) struct BigBedFullProcess { impl ChromProcess for BigBedFullProcess { type Value = BedEntry; + fn destroy(self) -> ChromProcessedData { + let Self { + summary, + total_items, + state_val, + .. + } = self; + + debug_assert!(state_val.items.is_empty()); + for zoom_item in state_val.zoom_items.iter() { + debug_assert!(zoom_item.live_info.is_none()); + debug_assert!(zoom_item.records.is_empty()); + } + + let mut summary_complete = match summary { + None => Summary { + total_items: 0, + bases_covered: 0, + min_val: 0.0, + max_val: 0.0, + sum: 0.0, + sum_squares: 0.0, + }, + Some(summary) => summary, + }; + summary_complete.total_items = total_items; + ChromProcessedData(summary_complete) + } fn create(internal_data: InternalProcessData) -> Self { let InternalProcessData(zooms_channels, ftx, chrom_id, options, runtime, chrom, length) = internal_data; - let mut summary: Option = None; + let summary: Option = None; - let mut zoom_items = zooms_channels + let zoom_items = zooms_channels .into_iter() .map(|(size, channel)| ZoomItem { size, @@ -609,12 +637,12 @@ impl ChromProcess for BigBedFullProcess { channel, }) .collect(); - let mut state_val = EntriesSection { + let state_val = EntriesSection { zoom_items, items: Vec::with_capacity(options.items_per_slot as usize), overlap: IndexList::new(), }; - let mut total_items = 0; + let total_items = 0; BigBedFullProcess { summary, state_val, @@ -628,25 +656,27 @@ impl ChromProcess for BigBedFullProcess { } } async fn do_process>( - self, + &mut self, mut data: Values, - ) -> Result> { + ) -> Result<(), ProcessChromError> { let Self { - mut summary, - mut total_items, - mut state_val, - mut ftx, + summary, + total_items, + state_val, + ftx, chrom_id, options, runtime, 
chrom, length, } = self; + let chrom_id = *chrom_id; + let length = *length; while let Some(current_val) = data.next() { // If there is a source error, propogate that up let current_val = current_val.map_err(ProcessChromError::SourceError)?; - total_items += 1; + *total_items += 1; let item_start = current_val.start; let item_end = current_val.end; @@ -656,19 +686,19 @@ impl ChromProcess for BigBedFullProcess { length, &chrom, &mut data, - &mut summary, + summary, &mut state_val.items, &mut state_val.overlap, - options, + *options, &runtime, - &mut ftx, + ftx, chrom_id, ) .await?; BigBedWrite::process_val_zoom( &mut state_val.zoom_items, - options, + *options, item_start, item_end, &mut data, @@ -678,25 +708,7 @@ impl ChromProcess for BigBedFullProcess { .await?; } - debug_assert!(state_val.items.is_empty()); - for zoom_item in state_val.zoom_items.iter_mut() { - debug_assert!(zoom_item.live_info.is_none()); - debug_assert!(zoom_item.records.is_empty()); - } - - let mut summary_complete = match summary { - None => Summary { - total_items: 0, - bases_covered: 0, - min_val: 0.0, - max_val: 0.0, - sum: 0.0, - sum_squares: 0.0, - }, - Some(summary) => summary, - }; - summary_complete.total_items = total_items; - Ok(ChromProcessedData(summary_complete)) + Ok(()) } } diff --git a/bigtools/src/bbi/bigwigwrite.rs b/bigtools/src/bbi/bigwigwrite.rs index 0ade459..39cb2b9 100644 --- a/bigtools/src/bbi/bigwigwrite.rs +++ b/bigtools/src/bbi/bigwigwrite.rs @@ -214,7 +214,8 @@ impl BigWigWrite { Ok(BigWigFullProcess::create(internal_data)) }; - let mut advance = |data: ChromProcessedData| { + let mut advance = |p: BigWigFullProcess| { + let data = p.destroy(); let ChromProcessedData(chrom_summary) = data; match &mut summary { None => summary = Some(chrom_summary), @@ -343,7 +344,8 @@ impl BigWigWrite { Ok(BigWigFullProcess::create(internal_data)) }; - let mut advance = |data: ChromProcessedData| { + let mut advance = |p: BigWigFullProcess| { + let data = p.destroy(); let 
ChromProcessedData(chrom_summary) = data; match &mut summary { None => summary = Some(chrom_summary), @@ -849,10 +851,10 @@ pub(crate) struct BigWigFullProcess { impl ChromProcess for BigWigFullProcess { type Value = Value; fn create(internal_data: InternalProcessData) -> Self { - let InternalProcessData(zooms_channels, mut ftx, chrom_id, options, runtime, chrom, length) = + let InternalProcessData(zooms_channels, ftx, chrom_id, options, runtime, chrom, length) = internal_data; - let mut summary = Summary { + let summary = Summary { total_items: 0, bases_covered: 0, min_val: f64::MAX, @@ -861,8 +863,8 @@ impl ChromProcess for BigWigFullProcess { sum_squares: 0.0, }; - let mut items = Vec::with_capacity(options.items_per_slot as usize); - let mut zoom_items: Vec = zooms_channels + let items = Vec::with_capacity(options.items_per_slot as usize); + let zoom_items: Vec = zooms_channels .into_iter() .map(|(size, channel)| ZoomItem { size, @@ -884,22 +886,45 @@ impl ChromProcess for BigWigFullProcess { length, } } + fn destroy(self) -> ChromProcessedData { + let Self { + mut summary, + items, + zoom_items, + .. 
+ } = self; + + debug_assert!(items.is_empty()); + for zoom_item in zoom_items.iter() { + debug_assert!(zoom_item.live_info.is_none()); + debug_assert!(zoom_item.records.is_empty()); + } + + if summary.total_items == 0 { + summary.min_val = 0.0; + summary.max_val = 0.0; + } + ChromProcessedData(summary) + } async fn do_process>( - self, + &mut self, mut data: Values, - ) -> Result> { + ) -> Result<(), ProcessChromError> { let Self { - mut summary, - mut items, - mut zoom_items, - mut ftx, + summary, + items, + zoom_items, + ftx, chrom_id, options, runtime, chrom, length, } = self; + let chrom_id = *chrom_id; + let options = *options; + let length = *length; while let Some(current_val) = data.next() { // If there is a source error, propogate that up @@ -914,17 +939,17 @@ impl ChromProcess for BigWigFullProcess { next_val, length, &chrom, - &mut summary, - &mut items, + summary, + items, options, &runtime, - &mut ftx, + ftx, chrom_id, ) .await?; BigWigWrite::process_val_zoom( - &mut zoom_items, + zoom_items, options, current_val, &mut data, @@ -934,17 +959,7 @@ impl ChromProcess for BigWigFullProcess { .await?; } - debug_assert!(items.is_empty()); - for zoom_item in zoom_items.iter_mut() { - debug_assert!(zoom_item.live_info.is_none()); - debug_assert!(zoom_item.records.is_empty()); - } - - if summary.total_items == 0 { - summary.min_val = 0.0; - summary.max_val = 0.0; - } - Ok(ChromProcessedData(summary)) + Ok(()) } } diff --git a/bigtools/src/utils/cli/bigwigmerge.rs b/bigtools/src/utils/cli/bigwigmerge.rs index 5dcb1ef..2cbe510 100644 --- a/bigtools/src/utils/cli/bigwigmerge.rs +++ b/bigtools/src/utils/cli/bigwigmerge.rs @@ -396,7 +396,7 @@ impl ChromData2 for ChromGroupReadImpl { fn process_to_bbi< P: ChromProcess::Value>, StartProcessing: FnMut(String) -> Result::Error>>, - Advance: FnMut(crate::ChromProcessedData), + Advance: FnMut(P), >( &mut self, runtime: &runtime::Handle, @@ -408,10 +408,10 @@ impl ChromData2 for ChromGroupReadImpl { self.iter.next(); match 
next { Some(Ok((chrom, _, group))) => { - let p = start_processing(chrom)?; + let mut p = start_processing(chrom)?; let read = p.do_process(group); - let data = runtime.block_on(read)?; - advance(data); + runtime.block_on(read)?; + advance(p); } Some(Err(e)) => return Err(ProcessChromError::SourceError(e)), None => break, From 4d527c2b4de43562ebffe1539a18257a731fa891 Mon Sep 17 00:00:00 2001 From: Jack Huey <31162821+jackh726@users.noreply.github.com> Date: Fri, 19 Apr 2024 01:05:15 -0400 Subject: [PATCH 08/31] It works --- bigtools/src/bbi/bbiwrite.rs | 22 +++-- bigtools/src/bbi/bedchromdata.rs | 35 ++++++-- bigtools/src/bbi/bigbedwrite.rs | 120 +++++++++++++------------- bigtools/src/bbi/bigwigwrite.rs | 87 +++++++++---------- bigtools/src/utils/cli/bigwigmerge.rs | 17 +++- 5 files changed, 163 insertions(+), 118 deletions(-) diff --git a/bigtools/src/bbi/bbiwrite.rs b/bigtools/src/bbi/bbiwrite.rs index 8ceb1a6..2077efb 100644 --- a/bigtools/src/bbi/bbiwrite.rs +++ b/bigtools/src/bbi/bbiwrite.rs @@ -721,14 +721,22 @@ pub struct InternalProcessData( pub(crate) u32, ); -pub(crate) trait ChromProcess { - type Value; - fn create(internal_data: InternalProcessData) -> Self; - fn destroy(self) -> ChromProcessedData; - async fn do_process>( +pub mod process_internal { + use super::*; + + pub trait ChromProcessCreate { + fn create(internal_data: InternalProcessData) -> Self; + fn destroy(self) -> ChromProcessedData; + } +} + +pub trait ChromProcess: process_internal::ChromProcessCreate { + type Value: Send + 'static; + fn do_process( &mut self, - data: Values, - ) -> Result<(), ProcessChromError>; + current_val: Self::Value, + next_val: Option<&Self::Value>, + ) -> impl Future>>; } pub(crate) fn write_vals< diff --git a/bigtools/src/bbi/bedchromdata.rs b/bigtools/src/bbi/bedchromdata.rs index 1f917e6..46dc22a 100644 --- a/bigtools/src/bbi/bedchromdata.rs +++ b/bigtools/src/bbi/bedchromdata.rs @@ -91,7 +91,7 @@ impl ChromData2 for BedParserStreamingIterator { ) -> 
Result<(), ProcessChromError<::Error>> { loop { match self.bed_data.next_chrom() { - Some(Ok((chrom, group))) => { + Some(Ok((chrom, mut group))) => { // First, if we don't want to allow out of order chroms, error here let last = self.last_chrom.replace(chrom.clone()); if let Some(c) = last { @@ -102,8 +102,19 @@ impl ChromData2 for BedParserStreamingIterator { } let mut p = start_processing(chrom)?; - let read = p.do_process(group); - runtime.block_on(read)?; + + while let Some(current_val) = group.next() { + // If there is a source error, propogate that up + let current_val = current_val.map_err(ProcessChromError::SourceError)?; + let next_val = match group.peek() { + None | Some(Err(_)) => None, + Some(Ok(v)) => Some(v), + }; + + let read = p.do_process(current_val, next_val); + runtime.block_on(read)?; + } + advance(p); } Some(Err(e)) => return Err(ProcessChromError::SourceError(e)), @@ -234,7 +245,7 @@ impl ChromData2 runtime: &Handle, start_processing: &mut StartProcessing, advance: &mut Advance, - ) -> Result<(), ProcessChromError<::Error>> { + ) -> Result<(), ProcessChromError> { let mut remaining = true; let mut queued_reads: VecDeque<_> = VecDeque::new(); loop { @@ -258,7 +269,7 @@ impl ChromData2 }); match parser.next_chrom() { - Some(Ok((chrom, group))) => { + Some(Ok((chrom, mut group))) => { let last = self.last_chrom.replace(chrom.clone()); if let Some(c) = last { // TODO: test this correctly fails @@ -268,10 +279,22 @@ impl ChromData2 } let mut p = start_processing(chrom)?; + let runtime_handle = runtime.clone(); let data: tokio::task::JoinHandle< Result>, > = runtime.spawn(async move { - p.do_process(group).await?; + while let Some(current_val) = group.next() { + // If there is a source error, propogate that up + let current_val = + current_val.map_err(ProcessChromError::SourceError)?; + let next_val = match group.peek() { + None | Some(Err(_)) => None, + Some(Ok(v)) => Some(v), + }; + + let read = p.do_process(current_val, next_val); + 
runtime_handle.block_on(read)?; + } Ok(p) }); queued_reads.push_back(data); diff --git a/bigtools/src/bbi/bigbedwrite.rs b/bigtools/src/bbi/bigbedwrite.rs index 034b79b..567a23d 100644 --- a/bigtools/src/bbi/bigbedwrite.rs +++ b/bigtools/src/bbi/bigbedwrite.rs @@ -10,6 +10,7 @@ use futures::sink::SinkExt; use byteorder::{NativeEndian, WriteBytesExt}; use tokio::runtime::{Handle, Runtime}; +use crate::bbiwrite::process_internal::ChromProcessCreate; use crate::utils::chromvalues::ChromValues; use crate::utils::indexlist::IndexList; use crate::utils::tell::Tell; @@ -239,11 +240,11 @@ impl BigBedWrite { Ok(()) } - async fn process_val>( + async fn process_val( current_val: BedEntry, + next_val: Option<&BedEntry>, chrom_length: u32, chrom: &String, - chrom_values: &mut I, summary: &mut Option, items: &mut Vec, overlap: &mut IndexList, @@ -251,7 +252,7 @@ impl BigBedWrite { runtime: &Handle, ftx: &mut ChromProcessingInputSectionChannel, chrom_id: u32, - ) -> Result<(), ProcessChromError> { + ) -> Result<(), ProcessChromError> { // Check a few preconditions: // - The current end is greater than or equal to the start // - The current end is at most the chromosome length @@ -269,9 +270,9 @@ impl BigBedWrite { current_val.start, chrom, chrom_length ))); } - match chrom_values.peek() { - None | Some(Err(_)) => (), - Some(Ok(next_val)) => { + match next_val { + None => (), + Some(next_val) => { if current_val.start > next_val.start { return Err(ProcessChromError::InvalidInput(format!( "Invalid bed: not sorted on chromosome {} at {}-{} (first) and {}-{} (second). 
Use sort -k1,1 -k2,2n to sort the bed before input.", @@ -383,12 +384,12 @@ impl BigBedWrite { summary, current_val.start, current_val.end, - chrom_values.peek().and_then(|v| v.ok()).map(|v| v.start), + next_val.map(|v| v.start), ); // Then, add the current item to the actual values, and encode if full, or last item items.push(current_val); - if chrom_values.peek().is_none() || items.len() >= options.items_per_slot as usize { + if next_val.is_none() || items.len() >= options.items_per_slot as usize { let items = std::mem::replace(items, Vec::with_capacity(options.items_per_slot as usize)); let handle = runtime @@ -400,15 +401,15 @@ impl BigBedWrite { Ok(()) } - async fn process_val_zoom>( + async fn process_val_zoom( zoom_items: &mut Vec, options: BBIWriteOptions, item_start: u32, item_end: u32, - chrom_values: &mut I, + next_val: Option<&BedEntry>, runtime: &Handle, chrom_id: u32, - ) -> Result<(), ProcessChromError> { + ) -> Result<(), ProcessChromError> { // Then, add the item to the zoom item queues. This is a bit complicated. 
for zoom_item in zoom_items.iter_mut() { debug_assert_ne!(zoom_item.records.len(), options.items_per_slot as usize); @@ -452,11 +453,7 @@ impl BigBedWrite { }); } - let next_start = chrom_values - .peek() - .and_then(|v| v.ok()) - .map(|v| v.start) - .unwrap_or(u32::max_value()); + let next_start = next_val.map(|v| v.start).unwrap_or(u32::max_value()); while overlap .head() @@ -477,7 +474,7 @@ impl BigBedWrite { let mut add_start = removed_start; loop { if add_start >= removed_end { - if chrom_values.peek().is_none() { + if next_val.is_none() { if let Some((mut zoom2, total_items)) = zoom_item.live_info.take() { zoom2.summary.total_items = total_items; zoom_item.records.push(zoom2); @@ -591,8 +588,7 @@ pub(crate) struct BigBedFullProcess { length: u32, } -impl ChromProcess for BigBedFullProcess { - type Value = BedEntry; +impl ChromProcessCreate for BigBedFullProcess { fn destroy(self) -> ChromProcessedData { let Self { summary, @@ -655,10 +651,14 @@ impl ChromProcess for BigBedFullProcess { length, } } - async fn do_process>( +} +impl ChromProcess for BigBedFullProcess { + type Value = BedEntry; + async fn do_process( &mut self, - mut data: Values, - ) -> Result<(), ProcessChromError> { + current_val: Self::Value, + next_val: Option<&Self::Value>, + ) -> Result<(), ProcessChromError> { let Self { summary, total_items, @@ -673,40 +673,36 @@ impl ChromProcess for BigBedFullProcess { let chrom_id = *chrom_id; let length = *length; - while let Some(current_val) = data.next() { - // If there is a source error, propogate that up - let current_val = current_val.map_err(ProcessChromError::SourceError)?; - *total_items += 1; - - let item_start = current_val.start; - let item_end = current_val.end; - - BigBedWrite::process_val( - current_val, - length, - &chrom, - &mut data, - summary, - &mut state_val.items, - &mut state_val.overlap, - *options, - &runtime, - ftx, - chrom_id, - ) - .await?; - - BigBedWrite::process_val_zoom( - &mut state_val.zoom_items, - *options, - 
item_start, - item_end, - &mut data, - &runtime, - chrom_id, - ) - .await?; - } + *total_items += 1; + + let item_start = current_val.start; + let item_end = current_val.end; + + BigBedWrite::process_val( + current_val, + next_val, + length, + &chrom, + summary, + &mut state_val.items, + &mut state_val.overlap, + *options, + &runtime, + ftx, + chrom_id, + ) + .await?; + + BigBedWrite::process_val_zoom( + &mut state_val.zoom_items, + *options, + item_start, + item_end, + next_val, + &runtime, + chrom_id, + ) + .await?; Ok(()) } @@ -745,6 +741,10 @@ pub(crate) async fn process_chrom_no_zooms>( while let Some(current_val) = chrom_values.next() { // If there is a source error, propogate that up let current_val = current_val.map_err(ProcessChromError::SourceError)?; + let next_val = match chrom_values.peek() { + Some(Ok(v)) => Some(v), + _ => None, + }; total_items += 1; let item_start = current_val.start; @@ -752,9 +752,9 @@ pub(crate) async fn process_chrom_no_zooms>( BigBedWrite::process_val( current_val, + next_val, chrom_length, &chrom, - &mut chrom_values, &mut summary, &mut items, &mut overlap, @@ -821,6 +821,10 @@ pub(crate) async fn process_chrom_zoom>( while let Some(current_val) = chrom_values.next() { // If there is a source error, propogate that up let current_val = current_val.map_err(ProcessChromError::SourceError)?; + let next_val = match chrom_values.peek() { + Some(Ok(v)) => Some(v), + _ => None, + }; let item_start = current_val.start; let item_end = current_val.end; @@ -830,7 +834,7 @@ pub(crate) async fn process_chrom_zoom>( options, item_start, item_end, - &mut chrom_values, + next_val, &runtime, chrom_id, ) diff --git a/bigtools/src/bbi/bigwigwrite.rs b/bigtools/src/bbi/bigwigwrite.rs index 39cb2b9..212b087 100644 --- a/bigtools/src/bbi/bigwigwrite.rs +++ b/bigtools/src/bbi/bigwigwrite.rs @@ -54,6 +54,7 @@ use futures::sink::SinkExt; use byteorder::{NativeEndian, WriteBytesExt}; use tokio::runtime::{Handle, Runtime}; +use 
crate::bbiwrite::process_internal::ChromProcessCreate; use crate::utils::chromvalues::ChromValues; use crate::utils::idmap::IdMap; use crate::utils::tell::Tell; @@ -558,7 +559,7 @@ impl BigWigWrite { async fn process_val( current_val: Value, - next_val: Option, + next_val: Option<&Value>, chrom_length: u32, chrom: &String, summary: &mut Summary, @@ -622,14 +623,14 @@ impl BigWigWrite { Ok(()) } - async fn process_val_zoom>( + async fn process_val_zoom( zoom_items: &mut Vec, options: BBIWriteOptions, current_val: Value, - chrom_values: &mut I, + next_val: Option<&Value>, runtime: &Handle, chrom_id: u32, - ) -> Result<(), ProcessChromError> { + ) -> Result<(), ProcessChromError> { // Then, add the item to the zoom item queues. This is a bit complicated. for zoom_item in zoom_items.iter_mut() { debug_assert_ne!(zoom_item.records.len(), options.items_per_slot as usize); @@ -644,7 +645,7 @@ impl BigWigWrite { // Write section if full; or if no next section, some items, and no current zoom record if (add_start >= current_val.end && zoom_item.live_info.is_none() - && chrom_values.peek().is_none() + && next_val.is_none() && !zoom_item.records.is_empty()) || zoom_item.records.len() == options.items_per_slot as usize { @@ -659,7 +660,7 @@ impl BigWigWrite { .expect("Couln't send"); } if add_start >= current_val.end { - if chrom_values.peek().is_none() { + if next_val.is_none() { if let Some(zoom2) = zoom_item.live_info.take() { zoom_item.records.push(zoom2); continue; @@ -750,7 +751,7 @@ impl BigWigWrite { let current_val = current_val.map_err(ProcessChromError::SourceError)?; let next_val = match chrom_values.peek() { None | Some(Err(_)) => None, - Some(Ok(v)) => Some(*v), + Some(Ok(v)) => Some(v), }; BigWigWrite::process_val( @@ -814,12 +815,16 @@ impl BigWigWrite { while let Some(current_val) = chrom_values.next() { // If there is a source error, propogate that up let current_val = current_val.map_err(ProcessChromError::SourceError)?; + let next_val = match 
chrom_values.peek() { + None | Some(Err(_)) => None, + Some(Ok(v)) => Some(v), + }; BigWigWrite::process_val_zoom( &mut zoom_items, options, current_val, - &mut chrom_values, + next_val, &runtime, chrom_id, ) @@ -848,8 +853,7 @@ pub(crate) struct BigWigFullProcess { length: u32, } -impl ChromProcess for BigWigFullProcess { - type Value = Value; +impl ChromProcessCreate for BigWigFullProcess { fn create(internal_data: InternalProcessData) -> Self { let InternalProcessData(zooms_channels, ftx, chrom_id, options, runtime, chrom, length) = internal_data; @@ -906,11 +910,15 @@ impl ChromProcess for BigWigFullProcess { } ChromProcessedData(summary) } +} - async fn do_process>( +impl ChromProcess for BigWigFullProcess { + type Value = Value; + async fn do_process( &mut self, - mut data: Values, - ) -> Result<(), ProcessChromError> { + current_val: Value, + next_val: Option<&Value>, + ) -> Result<(), ProcessChromError> { let Self { summary, items, @@ -926,38 +934,29 @@ impl ChromProcess for BigWigFullProcess { let options = *options; let length = *length; - while let Some(current_val) = data.next() { - // If there is a source error, propogate that up - let current_val = current_val.map_err(ProcessChromError::SourceError)?; - let next_val = match data.peek() { - None | Some(Err(_)) => None, - Some(Ok(v)) => Some(*v), - }; - - BigWigWrite::process_val( - current_val, - next_val, - length, - &chrom, - summary, - items, - options, - &runtime, - ftx, - chrom_id, - ) - .await?; + BigWigWrite::process_val( + current_val, + next_val, + length, + &chrom, + summary, + items, + options, + &runtime, + ftx, + chrom_id, + ) + .await?; - BigWigWrite::process_val_zoom( - zoom_items, - options, - current_val, - &mut data, - &runtime, - chrom_id, - ) - .await?; - } + BigWigWrite::process_val_zoom( + zoom_items, + options, + current_val, + next_val, + &runtime, + chrom_id, + ) + .await?; Ok(()) } diff --git a/bigtools/src/utils/cli/bigwigmerge.rs b/bigtools/src/utils/cli/bigwigmerge.rs index 
2cbe510..fb2dc82 100644 --- a/bigtools/src/utils/cli/bigwigmerge.rs +++ b/bigtools/src/utils/cli/bigwigmerge.rs @@ -407,10 +407,21 @@ impl ChromData2 for ChromGroupReadImpl { let next: Option> = self.iter.next(); match next { - Some(Ok((chrom, _, group))) => { + Some(Ok((chrom, _, mut group))) => { let mut p = start_processing(chrom)?; - let read = p.do_process(group); - runtime.block_on(read)?; + + while let Some(current_val) = group.next() { + // If there is a source error, propogate that up + let current_val = current_val.map_err(ProcessChromError::SourceError)?; + let next_val = match group.peek() { + None | Some(Err(_)) => None, + Some(Ok(v)) => Some(v), + }; + + let read = p.do_process(current_val, next_val); + runtime.block_on(read)?; + } + advance(p); } Some(Err(e)) => return Err(ProcessChromError::SourceError(e)), From cb2c5cd6c1a97acca420e8d12984c0a6987e01cf Mon Sep 17 00:00:00 2001 From: Jack Huey <31162821+jackh726@users.noreply.github.com> Date: Fri, 19 Apr 2024 21:06:02 -0400 Subject: [PATCH 09/31] Problems --- bigtools/src/bbi/bbiwrite.rs | 308 ++++------ bigtools/src/bbi/bedchromdata.rs | 226 ++----- bigtools/src/bbi/bigbedwrite.rs | 307 ++++++---- bigtools/src/bbi/bigwigwrite.rs | 681 +++++++-------------- bigtools/src/utils/cli/bedgraphtobigwig.rs | 2 +- bigtools/src/utils/cli/bigwigmerge.rs | 43 +- bigtools/tests/bigwigwrite.rs | 3 +- 7 files changed, 597 insertions(+), 973 deletions(-) diff --git a/bigtools/src/bbi/bbiwrite.rs b/bigtools/src/bbi/bbiwrite.rs index 2077efb..edeeada 100644 --- a/bigtools/src/bbi/bbiwrite.rs +++ b/bigtools/src/bbi/bbiwrite.rs @@ -553,45 +553,19 @@ pub enum ChromDataState { Error(Error), } -/// An opaque key to indicate an processing chromosome -pub struct ChromProcessingKey(pub(crate) u32); - pub struct ChromProcessedData(pub(crate) Summary); /// Effectively like an Iterator of chromosome data pub trait ChromData: Sized { type Values: ChromValues; - fn advance< - State, - F: FnMut( - String, - Self::Values, - &mut 
State, - ) -> Result< - ChromProcessingKey, - ProcessChromError<::Error>, - >, - >( - &mut self, - do_read: &mut F, - state: &mut State, - ) -> Result< - ChromDataState::Error>, - ProcessChromError<::Error>, - >; -} - -pub trait ChromData2: Sized { - type Values: ChromValues; - fn process_to_bbi< P: ChromProcess::Value> + Send + 'static, StartProcessing: FnMut(String) -> Result::Error>>, - Advance: FnMut(P), + Advance: FnMut(P) -> Result<(), ProcessChromError<::Error>>, >( &mut self, - runtime: &Handle, + runtime: &Runtime, start_processing: &mut StartProcessing, advance: &mut Advance, ) -> Result<(), ProcessChromError<::Error>>; @@ -722,11 +696,11 @@ pub struct InternalProcessData( ); pub mod process_internal { - use super::*; - pub trait ChromProcessCreate { - fn create(internal_data: InternalProcessData) -> Self; - fn destroy(self) -> ChromProcessedData; + type I; + type Out; + fn create(internal_data: Self::I) -> Self; + fn destroy(self) -> Self::Out; } } @@ -736,13 +710,16 @@ pub trait ChromProcess: process_internal::ChromProcessCreate { &mut self, current_val: Self::Value, next_val: Option<&Self::Value>, - ) -> impl Future>>; + ) -> impl Future>> + Send; } pub(crate) fn write_vals< Values: ChromValues, - V: ChromData2, - P: ChromProcess + Send + 'static, + V: ChromData, + P: ChromProcess + + process_internal::ChromProcessCreate + + Send + + 'static, >( mut vals_iter: V, file: BufWriter, @@ -780,8 +757,6 @@ pub(crate) fn write_vals< let (write_fut, write_fut_handle) = write_fut.remote_handle(); runtime.spawn(write_fut); - let handle = runtime.handle(); - let setup_chrom = || { let (ftx, sections_handle, buf, section_receiver) = future_channel(options.channel_size, runtime.handle(), options.inmemory); @@ -856,9 +831,10 @@ pub(crate) fn write_vals< summary.sum_squares += chrom_summary.sum_squares; } } + Ok(()) }; - vals_iter.process_to_bbi(handle, &mut do_read, &mut advance)?; + vals_iter.process_to_bbi(&runtime, &mut do_read, &mut advance)?; drop(send); @@ 
-897,26 +873,29 @@ pub(crate) fn write_vals< )) } +pub(crate) struct NoZoomsInternalProcessData( + pub(crate) ChromProcessingInputSectionChannel, + pub(crate) u32, + pub(crate) BBIWriteOptions, + pub(crate) Handle, + pub(crate) String, + pub(crate) u32, +); +pub(crate) struct NoZoomsInternalProcessedData(pub(crate) Summary, pub(crate) Vec<(u64, u64)>); + pub(crate) fn write_vals_no_zoom< Values: ChromValues, V: ChromData, - Fut: Future), ProcessChromError>> - + Send + P: ChromProcess + + process_internal::ChromProcessCreate< + I = NoZoomsInternalProcessData, + Out = NoZoomsInternalProcessedData, + > + Send + 'static, - G: Fn( - ChromProcessingInputSectionChannel, - u32, - BBIWriteOptions, - Handle, - Values, - String, - u32, - ) -> Fut, >( mut vals_iter: V, file: BufWriter, options: BBIWriteOptions, - process_chrom: G, runtime: &Runtime, chrom_sizes: HashMap, ) -> Result< @@ -937,12 +916,11 @@ pub(crate) fn write_vals_no_zoom< let mut chrom_ids = IdMap::default(); - let mut key = 0; - let mut output: BTreeMap = BTreeMap::new(); - let mut summary: Option = None; let (send, recv) = futures_mpsc::unbounded(); let write_fut = write_chroms_without_zooms::(file, recv); + let (write_fut, write_fut_handle) = write_fut.remote_handle(); + runtime.spawn(write_fut); let setup_chrom = || { let (ftx, sections_handle, buf, section_receiver) = @@ -955,10 +933,7 @@ pub(crate) fn write_vals_no_zoom< ftx }; - let mut do_read = |chrom: String, - data: _, - output: &mut BTreeMap| - -> Result> { + let mut do_read = |chrom: String| -> Result<_, ProcessChromError<_>> { let length = match chrom_sizes.get(&chrom) { Some(length) => *length, None => { @@ -973,54 +948,43 @@ pub(crate) fn write_vals_no_zoom< let ftx = setup_chrom(); - let fut = process_chrom( + let internal_data = NoZoomsInternalProcessData( ftx, chrom_id, options, runtime.handle().clone(), - data, chrom, length, ); + Ok(P::create(internal_data)) + }; - let curr_key = key; - key += 1; + let mut advance = |p: P| { + let data = 
p.destroy(); + let NoZoomsInternalProcessedData(chrom_summary, zoom_counts) = data; - output.insert(curr_key, fut); + match &mut summary { + None => summary = Some(chrom_summary), + Some(summary) => { + summary.total_items += chrom_summary.total_items; + summary.bases_covered += chrom_summary.bases_covered; + summary.min_val = summary.min_val.min(chrom_summary.min_val); + summary.max_val = summary.max_val.max(chrom_summary.max_val); + summary.sum += chrom_summary.sum; + summary.sum_squares += chrom_summary.sum_squares; + } + } - Ok(ChromProcessingKey(curr_key)) + let zoom_count_map = BTreeMap::from_iter(zoom_counts.into_iter()); + for zoom_count in total_zoom_counts.iter_mut() { + let chrom_zoom_count = zoom_count_map.get(&zoom_count.0).copied().unwrap_or(1); + *zoom_count.1 += chrom_zoom_count; + } + Ok(()) }; - let (write_fut, write_fut_handle) = write_fut.remote_handle(); - runtime.spawn(write_fut); - loop { - match vals_iter.advance(&mut do_read, &mut output)? { - ChromDataState::NewChrom(read) => { - let fut = output.remove(&read.0).unwrap(); - let (chrom_summary, zoom_counts) = runtime.block_on(fut)?; - - match &mut summary { - None => summary = Some(chrom_summary), - Some(summary) => { - summary.total_items += chrom_summary.total_items; - summary.bases_covered += chrom_summary.bases_covered; - summary.min_val = summary.min_val.min(chrom_summary.min_val); - summary.max_val = summary.max_val.max(chrom_summary.max_val); - summary.sum += chrom_summary.sum; - summary.sum_squares += chrom_summary.sum_squares; - } - } + vals_iter.process_to_bbi(&runtime, &mut do_read, &mut advance)?; - let zoom_count_map = BTreeMap::from_iter(zoom_counts.into_iter()); - for zoom_count in total_zoom_counts.iter_mut() { - let chrom_zoom_count = zoom_count_map.get(&zoom_count.0).copied().unwrap_or(1); - *zoom_count.1 += chrom_zoom_count; - } - } - ChromDataState::Finished => break, - ChromDataState::Error(err) => return Err(ProcessChromError::SourceError(err)), - } - } drop(send); let 
summary_complete = summary.unwrap_or(Summary { @@ -1045,15 +1009,43 @@ pub(crate) fn write_vals_no_zoom< )) } +// Zooms have to be double-buffered: first because chroms could be processed in parallel and second because we don't know the offset of each zoom immediately +type InternalZoomValue = ( + Vec>, + TempFileBuffer>, + Option>>, +); + +pub(crate) struct InternalTempZoomInfo { + pub resolution: u32, + pub data_write_future: Box< + dyn Future>> + Send + Unpin, + >, + pub data: TempFileBuffer>>, + pub sections: crossbeam_channel::Receiver
, +} + +pub(crate) struct ZoomsInternalProcessData( + pub(crate) Vec>, + pub(crate) Vec<(u32, ChromProcessingInputSectionChannel)>, + pub(crate) u32, + pub(crate) BBIWriteOptions, + pub(crate) Handle, +); +pub(crate) struct ZoomsInternalProcessedData(pub(crate) Vec>); + pub(crate) fn write_zoom_vals< Values: ChromValues, V: ChromData, - Fut: Future>> + Send + 'static, - G: Fn(Vec<(u32, ChromProcessingInputSectionChannel)>, u32, BBIWriteOptions, Handle, Values) -> Fut, + P: ChromProcess + + process_internal::ChromProcessCreate< + I = ZoomsInternalProcessData, + Out = ZoomsInternalProcessedData, + > + Send + + 'static, >( mut vals_iter: V, options: BBIWriteOptions, - process_chrom_zoom: G, runtime: &Runtime, chrom_ids: &HashMap, average_size: u32, @@ -1061,26 +1053,8 @@ pub(crate) fn write_zoom_vals< mut file: BufWriter, data_size: u64, ) -> Result<(BufWriter, Vec, usize), ProcessChromError> { - // Zooms have to be double-buffered: first because chroms could be processed in parallel and second because we don't know the offset of each zoom immediately - type ZoomValue = ( - Vec>, - TempFileBuffer>, - Option>>, - ); - - pub(crate) struct TempZoomInfo { - pub resolution: u32, - pub data_write_future: Box< - dyn Future>> - + Send - + Unpin, - >, - pub data: TempFileBuffer>>, - pub sections: crossbeam_channel::Receiver
, - } - let min_first_zoom_size = average_size.max(10) * 4; - let mut zooms_map: BTreeMap = zoom_counts + let mut zooms_map: BTreeMap = zoom_counts .into_iter() .skip_while(|z| z.0 > min_first_zoom_size as u64) .skip_while(|z| { @@ -1109,13 +1083,7 @@ pub(crate) fn write_zoom_vals< let mut max_uncompressed_buf_size = 0; - let mut key = 0; - let mut output: BTreeMap = BTreeMap::new(); - - let mut do_read = |chrom: String, - data: _, - output: &mut BTreeMap| - -> Result> { + let mut do_read = |chrom: String| -> Result> { // Make a new id for the chromosome let chrom_id = *chrom_ids .get(&chrom) @@ -1128,7 +1096,7 @@ pub(crate) fn write_zoom_vals< for size in resolutions.iter().copied() { let (ftx, handle, buf, section_receiver) = future_channel(options.channel_size, runtime.handle(), options.inmemory); - let zoom_info = TempZoomInfo { + let zoom_info = InternalTempZoomInfo { resolution: size, data_write_future: Box::new(handle), data: buf, @@ -1140,70 +1108,58 @@ pub(crate) fn write_zoom_vals< (zoom_infos, zooms_channels) }; - let (f_remote, f_handle) = process_chrom_zoom( + let internal_data = ZoomsInternalProcessData( + zoom_infos, zooms_channels, chrom_id, options, runtime.handle().clone(), + ); + Ok(P::create(internal_data)) + }; + + let mut advance = |p: P| { + let data = p.destroy(); + let ZoomsInternalProcessedData(mut zooms) = data; + + // For each zoom, switch the current chromosome to write to the actual zoom file + for InternalTempZoomInfo { + resolution: size, data, - ) - .remote_handle(); - runtime.spawn(f_remote); + .. 
+ } in zooms.iter_mut() + { + let zoom = zooms_map.get_mut(size).unwrap(); + let writer = zoom.2.take().unwrap(); + data.switch(writer); + } - let curr_key = key; - key += 1; + for InternalTempZoomInfo { + resolution, + data_write_future, + data, + sections, + } in zooms.into_iter() + { + // First, we need to make sure that all the sections that were queued to encode have been written + let data_write_data = runtime.block_on(data_write_future); + let (_num_sections, uncompressed_buf_size) = match data_write_data { + Ok(d) => d, + Err(e) => return Err(e), + }; + max_uncompressed_buf_size = max_uncompressed_buf_size.max(uncompressed_buf_size); - output.insert(curr_key, (f_handle, zoom_infos)); + let zoom = zooms_map.get_mut(&resolution).unwrap(); + // Add the section data to the zoom + zoom.0.push(sections.into_iter()); + // Replace the zoom file again + zoom.2.replace(data.await_real_file()); + } - Ok(ChromProcessingKey(curr_key)) + Ok(()) }; - loop { - match vals_iter.advance(&mut do_read, &mut output)? { - ChromDataState::NewChrom(read) => { - let read = output.remove(&read.0).unwrap(); - let (process_future, mut zooms) = read; - // For each zoom, switch the current chromosome to write to the actual zoom file - for TempZoomInfo { - resolution: size, - data, - .. 
- } in zooms.iter_mut() - { - let zoom = zooms_map.get_mut(size).unwrap(); - let writer = zoom.2.take().unwrap(); - data.switch(writer); - } - - runtime.block_on(process_future)?; - - for TempZoomInfo { - resolution, - data_write_future, - data, - sections, - } in zooms.into_iter() - { - // First, we need to make sure that all the sections that were queued to encode have been written - let data_write_data = runtime.block_on(data_write_future); - let (_num_sections, uncompressed_buf_size) = match data_write_data { - Ok(d) => d, - Err(e) => return Err(e), - }; - max_uncompressed_buf_size = - max_uncompressed_buf_size.max(uncompressed_buf_size); - - let zoom = zooms_map.get_mut(&resolution).unwrap(); - // Add the section data to the zoom - zoom.0.push(sections.into_iter()); - // Replace the zoom file again - zoom.2.replace(data.await_real_file()); - } - } - ChromDataState::Finished => break, - ChromDataState::Error(err) => return Err(ProcessChromError::SourceError(err)), - } - } + vals_iter.process_to_bbi(&runtime, &mut do_read, &mut advance)?; let mut zoom_entries = Vec::with_capacity(zooms_map.len()); let mut zooms_map_iter = zooms_map.into_iter(); diff --git a/bigtools/src/bbi/bedchromdata.rs b/bigtools/src/bbi/bedchromdata.rs index 46dc22a..e0ecdb6 100644 --- a/bigtools/src/bbi/bedchromdata.rs +++ b/bigtools/src/bbi/bedchromdata.rs @@ -12,16 +12,14 @@ use std::fs::File; use std::io::{BufReader, Seek, SeekFrom}; use std::path::PathBuf; -use tokio::runtime::Handle; +use tokio::runtime::Runtime; use crate::bed::bedparser::{ BedChromData, BedFileStream, BedParser, BedValueError, Parser, StateValue, StreamingBedValues, }; use crate::utils::chromvalues::ChromValues; use crate::utils::streaming_linereader::StreamingLineReader; -use crate::{ - ChromData, ChromData2, ChromDataState, ChromProcess, ChromProcessingKey, ProcessChromError, -}; +use crate::{ChromData, ChromProcess, ProcessChromError}; pub struct BedParserStreamingIterator { bed_data: BedParser, @@ -42,50 +40,13 
@@ impl BedParserStreamingIterator { impl ChromData for BedParserStreamingIterator { type Values = BedChromData; - /// Advancing after `ChromDataState::Finished` has been called will result in a panic. - fn advance< - State, - F: FnMut( - String, - BedChromData, - &mut State, - ) -> Result>, - >( - &mut self, - do_read: &mut F, - state: &mut State, - ) -> Result, ProcessChromError> - { - Ok(match self.bed_data.next_chrom() { - Some(Ok((chrom, group))) => { - // First, if we don't want to allow out of order chroms, error here - let last = self.last_chrom.replace(chrom.clone()); - if let Some(c) = last { - // TODO: test this correctly fails - if !self.allow_out_of_order_chroms && c >= chrom { - return Ok(ChromDataState::Error(BedValueError::InvalidInput("Input bedGraph not sorted by chromosome. Sort with `sort -k1,1 -k2,2n`.".to_string()))); - } - } - - let read = do_read(chrom, group, state)?; - ChromDataState::NewChrom(read) - } - Some(Err(e)) => ChromDataState::Error(e), - None => ChromDataState::Finished, - }) - } -} - -impl ChromData2 for BedParserStreamingIterator { - type Values = BedChromData; - fn process_to_bbi< P: ChromProcess::Value>, StartProcessing: FnMut(String) -> Result::Error>>, - Advance: FnMut(P), + Advance: FnMut(P) -> Result<(), ProcessChromError<::Error>>, >( &mut self, - runtime: &Handle, + runtime: &Runtime, start_processing: &mut StartProcessing, advance: &mut Advance, ) -> Result<(), ProcessChromError<::Error>> { @@ -115,7 +76,7 @@ impl ChromData2 for BedParserStreamingIterator { runtime.block_on(read)?; } - advance(p); + advance(p)?; } Some(Err(e)) => return Err(ProcessChromError::SourceError(e)), None => break, @@ -126,18 +87,16 @@ impl ChromData2 for BedParserStreamingIterator { } } -pub struct BedParserParallelStreamingIterator { +pub struct BedParserParallelStreamingIterator { allow_out_of_order_chroms: bool, last_chrom: Option, chrom_indices: Vec<(u64, String)>, parse_fn: Parser, path: PathBuf, - - queued_reads: VecDeque, E>>, } -impl 
BedParserParallelStreamingIterator { +impl BedParserParallelStreamingIterator { pub fn new( mut chrom_indices: Vec<(u64, String)>, allow_out_of_order_chroms: bool, @@ -155,94 +114,20 @@ impl BedParserParallelStreamingIterator { chrom_indices, parse_fn, path, - - queued_reads: VecDeque::new(), - } - } -} - -impl ChromData - for BedParserParallelStreamingIterator, BedValueError> -{ - type Values = BedChromData>>; - - fn advance< - State, - F: FnMut( - String, - BedChromData>>, - &mut State, - ) -> Result>, - >( - &mut self, - do_read: &mut F, - state: &mut State, - ) -> Result, ProcessChromError> - { - let mut begin_next = |_self: &mut Self| -> Result<_, ProcessChromError> { - let curr = match _self.chrom_indices.pop() { - Some(c) => c, - None => { - return Ok(ChromDataState::<_, BedValueError>::Finished); - } - }; - - let mut file = match File::open(&_self.path) { - Ok(f) => f, - Err(err) => return Ok(ChromDataState::Error(err.into())), - }; - file.seek(SeekFrom::Start(curr.0))?; - let mut parser = BedParser::new(BedFileStream { - bed: StreamingLineReader::new(BufReader::new(file)), - parse: _self.parse_fn, - }); - - Ok(match parser.next_chrom() { - Some(Ok((chrom, group))) => { - let last = _self.last_chrom.replace(chrom.clone()); - if let Some(c) = last { - // TODO: test this correctly fails - if !_self.allow_out_of_order_chroms && c >= chrom { - return Ok(ChromDataState::Error(BedValueError::InvalidInput("Input bedGraph not sorted by chromosome. 
Sort with `sort -k1,1 -k2,2n`.".to_string()))); - } - } - - let read = do_read(chrom, group, state)?; - - ChromDataState::NewChrom(read) - } - Some(Err(e)) => ChromDataState::Error(e), - None => { - panic!("Unexpected end of file") - } - }) - }; - - while self.queued_reads.len() < (4 + 1) - && matches!( - self.queued_reads.back(), - None | Some(Ok(ChromDataState::NewChrom(..))) - ) - { - let next = begin_next(self); - self.queued_reads.push_back(next); } - self.queued_reads.pop_front().unwrap() } } -impl ChromData2 - for BedParserParallelStreamingIterator, BedValueError> -{ +impl ChromData for BedParserParallelStreamingIterator { type Values = BedChromData>>; fn process_to_bbi< P: ChromProcess::Value> + Send + 'static, StartProcessing: FnMut(String) -> Result::Error>>, - Advance: FnMut(P), + Advance: FnMut(P) -> Result<(), ProcessChromError<::Error>>, >( &mut self, - runtime: &Handle, + runtime: &Runtime, start_processing: &mut StartProcessing, advance: &mut Advance, ) -> Result<(), ProcessChromError> { @@ -279,7 +164,7 @@ impl ChromData2 } let mut p = start_processing(chrom)?; - let runtime_handle = runtime.clone(); + let runtime_handle = runtime.handle().clone(); let data: tokio::task::JoinHandle< Result>, > = runtime.spawn(async move { @@ -309,7 +194,7 @@ impl ChromData2 break; }; let p = runtime.block_on(next_chrom).unwrap()?; - advance(p); + advance(p)?; } Ok(()) @@ -348,8 +233,8 @@ impl ChromValues for BedChromData { mod tests { use super::*; use crate::bed::bedparser::parse_bedgraph; - use crate::ProcessChromError; - use std::collections::BTreeMap; + use crate::process_internal::ChromProcessCreate; + use crate::{ProcessChromError, Value}; use std::fs::File; use std::io; use std::path::PathBuf; @@ -369,52 +254,45 @@ mod tests { PathBuf::from(dir.clone()), parse_bedgraph, ); - - let mut chrom_ids = crate::utils::idmap::IdMap::default(); - let mut key = 0; - let mut output: BTreeMap = BTreeMap::new(); - let mut do_read = |chrom: String, - _: _, - output: &mut 
BTreeMap| - -> Result> { - // Make a new id for the chromosome - let chrom_id = chrom_ids.get_id(&chrom); - - let curr_key = key; - key += 1; - - output.insert(curr_key, chrom_id); - - Ok(ChromProcessingKey(curr_key)) + let runtime = tokio::runtime::Builder::new_multi_thread().build().unwrap(); + let mut counts = vec![]; + struct TestChromProcess { + count: usize, + } + impl ChromProcessCreate for TestChromProcess { + type I = (); + type Out = (); + fn create(_: Self::I) -> Self { + TestChromProcess { count: 0 } + } + fn destroy(self) -> Self::Out {} + } + impl ChromProcess for TestChromProcess { + type Value = Value; + async fn do_process( + &mut self, + _current_val: Self::Value, + _next_val: Option<&Self::Value>, + ) -> Result<(), ProcessChromError> { + self.count += 1; + dbg!(self.count); + + Ok(()) + } + } + let mut start_processing = |chrom: String| { + dbg!(chrom); + Ok(TestChromProcess::create(())) + }; + let mut advance = |p: TestChromProcess| { + dbg!(p.count); + counts.push(p.count); + let _ = p.destroy(); + Ok(()) }; - assert!(matches!( - chsi.advance(&mut do_read, &mut output), - Ok(ChromDataState::NewChrom(..)) - )); - assert!(matches!( - chsi.advance(&mut do_read, &mut output), - Ok(ChromDataState::NewChrom(..)) - )); - assert!(matches!( - chsi.advance(&mut do_read, &mut output), - Ok(ChromDataState::NewChrom(..)) - )); - assert!(matches!( - chsi.advance(&mut do_read, &mut output), - Ok(ChromDataState::NewChrom(..)) - )); - assert!(matches!( - chsi.advance(&mut do_read, &mut output), - Ok(ChromDataState::NewChrom(..)) - )); - assert!(matches!( - chsi.advance(&mut do_read, &mut output), - Ok(ChromDataState::NewChrom(..)) - )); - assert!(matches!( - chsi.advance(&mut do_read, &mut output), - Ok(ChromDataState::Finished) - )); + chsi.process_to_bbi(&runtime, &mut start_processing, &mut advance) + .unwrap(); + assert_eq!(counts, vec![]); Ok(()) } diff --git a/bigtools/src/bbi/bigbedwrite.rs b/bigtools/src/bbi/bigbedwrite.rs index 567a23d..ec27a16 100644 
--- a/bigtools/src/bbi/bigbedwrite.rs +++ b/bigtools/src/bbi/bigbedwrite.rs @@ -15,8 +15,9 @@ use crate::utils::chromvalues::ChromValues; use crate::utils::indexlist::IndexList; use crate::utils::tell::Tell; use crate::{ - write_info, ChromData, ChromData2, ChromProcess, ChromProcessedData, - ChromProcessingInputSectionChannel, InternalProcessData, + write_info, ChromData, ChromProcess, ChromProcessedData, ChromProcessingInputSectionChannel, + InternalProcessData, InternalTempZoomInfo, NoZoomsInternalProcessData, + NoZoomsInternalProcessedData, ZoomsInternalProcessData, ZoomsInternalProcessedData, }; use crate::bbi::{BedEntry, Summary, Value, ZoomRecord, BIGBED_MAGIC}; @@ -81,7 +82,7 @@ impl BigBedWrite { } /// Write the values from `V` as a bigWig. Will utilize the provided runtime for encoding values and for reading through the values (potentially parallelized by chromosome). - pub fn write, V: ChromData2>( + pub fn write, V: ChromData>( self, chrom_sizes: HashMap, vals: V, @@ -158,34 +159,10 @@ impl BigBedWrite { let vals = make_vals()?; - let runtime_handle = runtime.handle(); - - let process_chrom = |ftx: ChromProcessingInputSectionChannel, - chrom_id: u32, - options: BBIWriteOptions, - runtime: Handle, - chrom_values: Values, - chrom: String, - chrom_length: u32| { - let fut = process_chrom_no_zooms( - ftx, - chrom_id, - options, - runtime, - chrom_values, - chrom, - chrom_length, - ); - let (fut, handle) = fut.remote_handle(); - runtime_handle.spawn(fut); - handle - }; - - let output = bbiwrite::write_vals_no_zoom( + let output = bbiwrite::write_vals_no_zoom::<_, _, BigBedNoZoomsProcess>( vals, file, self.options, - process_chrom, &runtime, chrom_sizes.clone(), ); @@ -204,10 +181,9 @@ impl BigBedWrite { let vals = make_vals()?; - let output = bbiwrite::write_zoom_vals( + let output = bbiwrite::write_zoom_vals::<_, _, BigBedZoomsProcess<_>>( vals, self.options, - process_chrom_zoom, &runtime, &chrom_ids, (summary.bases_covered as f64 / summary.total_items as 
f64) as u32, @@ -589,6 +565,8 @@ pub(crate) struct BigBedFullProcess { } impl ChromProcessCreate for BigBedFullProcess { + type I = InternalProcessData; + type Out = ChromProcessedData; fn destroy(self) -> ChromProcessedData { let Self { summary, @@ -708,44 +686,112 @@ impl ChromProcess for BigBedFullProcess { } } -pub(crate) async fn process_chrom_no_zooms>( - mut ftx: ChromProcessingInputSectionChannel, +#[derive(Debug, Copy, Clone)] +struct ZoomCounts { + resolution: u64, + current_end: u64, + counts: u64, +} +struct BigBedNoZoomsProcess { + ftx: ChromProcessingInputSectionChannel, chrom_id: u32, options: BBIWriteOptions, runtime: Handle, - mut chrom_values: I, chrom: String, - chrom_length: u32, -) -> Result<(Summary, Vec<(u64, u64)>), ProcessChromError> { - #[derive(Debug, Copy, Clone)] - struct ZoomCounts { - resolution: u64, - current_end: u64, - counts: u64, + length: u32, + + summary: Option, + items: Vec, + overlap: IndexList, + zoom_counts: Vec, + total_items: u64, +} + +impl ChromProcessCreate for BigBedNoZoomsProcess { + type I = NoZoomsInternalProcessData; + type Out = NoZoomsInternalProcessedData; + fn create(internal_data: Self::I) -> Self { + let NoZoomsInternalProcessData(ftx, chrom_id, options, runtime, chrom, length) = + internal_data; + + let summary = None; + + let items: Vec = Vec::with_capacity(options.items_per_slot as usize); + let zoom_counts: Vec = std::iter::successors(Some(10), |z| Some(z * 4)) + .take_while(|z| *z <= u64::MAX / 4 && *z <= length as u64 * 4) + .map(|z| ZoomCounts { + resolution: z, + current_end: 0, + counts: 0, + }) + .collect(); + + BigBedNoZoomsProcess { + ftx, + chrom_id, + options, + runtime, + chrom, + length, + summary, + items, + overlap: IndexList::new(), + zoom_counts, + total_items: 0, + } } + fn destroy(self) -> Self::Out { + let BigBedNoZoomsProcess { + items, + summary, + zoom_counts, + total_items, + .. 
+ } = self; - let mut summary: Option = None; - - let mut items = Vec::with_capacity(options.items_per_slot as usize); - let mut overlap = IndexList::new(); - let mut zoom_counts: Vec = std::iter::successors(Some(10), |z| Some(z * 4)) - .take_while(|z| *z <= u64::MAX / 4 && *z <= chrom_length as u64 * 4) - .map(|z| ZoomCounts { - resolution: z, - current_end: 0, - counts: 0, - }) - .collect(); - - let mut total_items = 0; - while let Some(current_val) = chrom_values.next() { - // If there is a source error, propogate that up - let current_val = current_val.map_err(ProcessChromError::SourceError)?; - let next_val = match chrom_values.peek() { - Some(Ok(v)) => Some(v), - _ => None, - }; - total_items += 1; + debug_assert!(items.is_empty()); + + let mut summary = summary.unwrap_or(Summary { + total_items: 0, + bases_covered: 0, + min_val: 0.0, + max_val: 0.0, + sum: 0.0, + sum_squares: 0.0, + }); + summary.total_items = total_items; + + let zoom_counts = zoom_counts + .into_iter() + .map(|z| (z.resolution, z.counts)) + .collect(); + + NoZoomsInternalProcessedData(summary, zoom_counts) + } +} + +impl ChromProcess for BigBedNoZoomsProcess { + type Value = BedEntry; + async fn do_process( + &mut self, + current_val: Self::Value, + next_val: Option<&Self::Value>, + ) -> Result<(), ProcessChromError> { + let BigBedNoZoomsProcess { + ftx, + chrom_id, + options, + runtime, + chrom, + length, + summary, + items, + overlap, + zoom_counts, + total_items, + } = self; + + *total_items += 1; let item_start = current_val.start; let item_end = current_val.end; @@ -753,19 +799,19 @@ pub(crate) async fn process_chrom_no_zooms>( BigBedWrite::process_val( current_val, next_val, - chrom_length, + *length, &chrom, - &mut summary, - &mut items, - &mut overlap, - options, + summary, + items, + overlap, + *options, &runtime, - &mut ftx, - chrom_id, + ftx, + *chrom_id, ) .await?; - for zoom in &mut zoom_counts { + for zoom in zoom_counts { if item_start as u64 >= zoom.current_end { 
zoom.counts += 1; zoom.current_end = item_start as u64 + zoom.resolution; @@ -775,78 +821,85 @@ pub(crate) async fn process_chrom_no_zooms>( zoom.current_end += zoom.resolution; } } - } - - debug_assert!(items.is_empty()); - - let mut summary_complete = match summary { - None => Summary { - total_items: 0, - bases_covered: 0, - min_val: 0.0, - max_val: 0.0, - sum: 0.0, - sum_squares: 0.0, - }, - Some(summary) => summary, - }; - summary_complete.total_items = total_items; - - let zoom_counts = zoom_counts - .into_iter() - .map(|z| (z.resolution, z.counts)) - .collect(); - Ok((summary_complete, zoom_counts)) + Ok(()) + } } -pub(crate) async fn process_chrom_zoom>( - zooms_channels: Vec<(u32, ChromProcessingInputSectionChannel)>, +struct BigBedZoomsProcess { + temp_zoom_items: Vec>, chrom_id: u32, options: BBIWriteOptions, runtime: Handle, - mut chrom_values: I, -) -> Result<(), ProcessChromError> { - let mut zoom_items: Vec = zooms_channels - .into_iter() - .map(|(size, channel)| ZoomItem { - size, - live_info: None, - overlap: IndexList::new(), - records: Vec::with_capacity(options.items_per_slot as usize), - channel, - }) - .collect(); - - while let Some(current_val) = chrom_values.next() { - // If there is a source error, propogate that up - let current_val = current_val.map_err(ProcessChromError::SourceError)?; - let next_val = match chrom_values.peek() { - Some(Ok(v)) => Some(v), - _ => None, - }; - let item_start = current_val.start; - let item_end = current_val.end; + zoom_items: Vec, +} - BigBedWrite::process_val_zoom( - &mut zoom_items, +impl ChromProcessCreate for BigBedZoomsProcess { + type I = ZoomsInternalProcessData; + type Out = ZoomsInternalProcessedData; + fn create(internal_data: Self::I) -> Self { + let ZoomsInternalProcessData(temp_zoom_items, zooms_channels, chrom_id, options, runtime) = + internal_data; + + let zoom_items: Vec = zooms_channels + .into_iter() + .map(|(size, channel)| ZoomItem { + size, + live_info: None, + overlap: 
IndexList::new(), + records: Vec::with_capacity(options.items_per_slot as usize), + channel, + }) + .collect(); + + BigBedZoomsProcess { + temp_zoom_items, + chrom_id, options, - item_start, - item_end, + runtime, + zoom_items, + } + } + fn destroy(self) -> Self::Out { + let BigBedZoomsProcess { zoom_items, .. } = self; + + for zoom_item in zoom_items.iter() { + debug_assert!(zoom_item.live_info.is_none()); + debug_assert!(zoom_item.records.is_empty()); + } + + ZoomsInternalProcessedData(self.temp_zoom_items) + } +} +impl ChromProcess for BigBedZoomsProcess { + type Value = BedEntry; + async fn do_process( + &mut self, + current_val: Self::Value, + next_val: Option<&Self::Value>, + ) -> Result<(), ProcessChromError> { + let BigBedZoomsProcess { + chrom_id, + options, + runtime, + zoom_items, + .. + } = self; + + BigBedWrite::process_val_zoom( + zoom_items, + *options, + current_val.start, + current_val.end, next_val, &runtime, - chrom_id, + *chrom_id, ) .await?; - } - for zoom_item in zoom_items.iter_mut() { - debug_assert!(zoom_item.live_info.is_none()); - debug_assert!(zoom_item.records.is_empty()); + Ok(()) } - - Ok(()) } async fn encode_section( diff --git a/bigtools/src/bbi/bigwigwrite.rs b/bigtools/src/bbi/bigwigwrite.rs index 212b087..093dc13 100644 --- a/bigtools/src/bbi/bigwigwrite.rs +++ b/bigtools/src/bbi/bigwigwrite.rs @@ -40,14 +40,12 @@ out.write(chrom_map, vals, runtime)?; # } ``` */ -use std::collections::{BTreeMap, HashMap}; +use std::collections::HashMap; use std::error::Error; use std::fs::File; -use std::future::Future; use std::io::{self, BufWriter, Write}; use std::vec; -use futures::channel::mpsc as futures_mpsc; use futures::future::FutureExt; use futures::sink::SinkExt; @@ -56,13 +54,11 @@ use tokio::runtime::{Handle, Runtime}; use crate::bbiwrite::process_internal::ChromProcessCreate; use crate::utils::chromvalues::ChromValues; -use crate::utils::idmap::IdMap; use crate::utils::tell::Tell; -use 
crate::utils::tempfilebuffer::{TempFileBuffer, TempFileBufferWriter}; use crate::{ - future_channel, write_chroms_with_zooms, write_info, ChromData, ChromData2, ChromProcess, - ChromProcessedData, ChromProcessingInputSectionChannel, InternalProcessData, Section, - TempZoomInfo, ZoomInfo, ZoomValue, + write_info, ChromData, ChromProcess, ChromProcessedData, ChromProcessingInputSectionChannel, + InternalProcessData, InternalTempZoomInfo, NoZoomsInternalProcessData, + NoZoomsInternalProcessedData, ZoomsInternalProcessData, ZoomsInternalProcessedData, }; use crate::bbi::{Summary, Value, ZoomRecord, BIGWIG_MAGIC}; @@ -118,13 +114,10 @@ impl BigWigWrite { } /// Write the values from `V` as a bigWig. Will utilize the provided runtime for encoding values and for reading through the values (potentially parallelized by chromosome). - pub fn write< - Values: ChromValues + Send + 'static, - V: ChromData2, - >( + pub fn write, V: ChromData>( self, chrom_sizes: HashMap, - mut vals: V, + vals: V, runtime: Runtime, ) -> Result<(), ProcessChromError> { let options = self.options; @@ -133,296 +126,22 @@ impl BigWigWrite { let (total_summary_offset, full_data_offset, pre_data) = BigWigWrite::write_pre(&mut file)?; - let zooms_map: BTreeMap = - std::iter::successors(Some(options.initial_zoom_size), |z| Some(z * 4)) - .take(options.max_zooms as usize) - .map(|size| { - let section_iter = vec![]; - let (buf, write): (TempFileBuffer, TempFileBufferWriter) = - TempFileBuffer::new(options.inmemory); - let value = (section_iter, buf, Some(write)); - (size, value) - }) - .collect(); - - let mut chrom_ids = IdMap::default(); - - let mut summary: Option = None; - let (send, recv) = futures_mpsc::unbounded(); - let write_fut = write_chroms_with_zooms(file, zooms_map, recv); - let (write_fut, write_fut_handle) = write_fut.remote_handle(); - runtime.spawn(write_fut); - - let handle = runtime.handle(); - - let setup_chrom = || { - let (ftx, sections_handle, buf, section_receiver) = - 
future_channel(options.channel_size, runtime.handle(), options.inmemory); - - let (zoom_infos, zooms_channels) = { - let mut zoom_infos = Vec::with_capacity(options.max_zooms as usize); - let mut zooms_channels = Vec::with_capacity(options.max_zooms as usize); - - let zoom_sizes = - std::iter::successors(Some(options.initial_zoom_size), |z| Some(z * 4)) - .take(options.max_zooms as usize); - for size in zoom_sizes { - let (ftx, handle, buf, section_receiver) = - future_channel(options.channel_size, runtime.handle(), options.inmemory); - let zoom_info = TempZoomInfo { - resolution: size, - data_write_future: Box::new(handle), - data: buf, - sections: section_receiver, - }; - zoom_infos.push(zoom_info); - zooms_channels.push((size, ftx)); - } - (zoom_infos, zooms_channels) - }; - - match send.unbounded_send((section_receiver, buf, sections_handle, zoom_infos)) { - Ok(_) => {} - Err(_) => panic!("Expected to always send."), - } - - (zooms_channels, ftx) - }; - let mut do_read = |chrom: String| -> Result<_, ProcessChromError<_>> { - let length = match chrom_sizes.get(&chrom) { - Some(length) => *length, - None => { - return Err(ProcessChromError::InvalidChromosome(format!( - "Input bedGraph contains chromosome that isn't in the input chrom sizes: {}", - chrom - ))); - } - }; - // Make a new id for the chromosome - let chrom_id = chrom_ids.get_id(&chrom); - - let (zooms_channels, ftx) = setup_chrom(); - - let internal_data = crate::InternalProcessData( - zooms_channels, - ftx, - chrom_id, - options, - runtime.handle().clone(), - chrom, - length, - ); - Ok(BigWigFullProcess::create(internal_data)) - }; - - let mut advance = |p: BigWigFullProcess| { - let data = p.destroy(); - let ChromProcessedData(chrom_summary) = data; - match &mut summary { - None => summary = Some(chrom_summary), - Some(summary) => { - summary.total_items += chrom_summary.total_items; - summary.bases_covered += chrom_summary.bases_covered; - summary.min_val = 
summary.min_val.min(chrom_summary.min_val); - summary.max_val = summary.max_val.max(chrom_summary.max_val); - summary.sum += chrom_summary.sum; - summary.sum_squares += chrom_summary.sum_squares; - } - } - }; - - vals.process_to_bbi(handle, &mut do_read, &mut advance)?; - - drop(send); - - self.write_internal_post( - summary, + let output = bbiwrite::write_vals::<_, _, BigWigFullProcess>( + vals, + file, + options, runtime, - write_fut_handle, - chrom_ids, - pre_data, - chrom_sizes, - full_data_offset, - total_summary_offset, - ) - } - - /// Write the values from `V` as a bigWig. Will utilize the provided runtime for encoding values, but will read through values on the current thread. - pub fn write_singlethreaded< - Values: ChromValues, - V: ChromData2, - >( - self, - chrom_sizes: HashMap, - mut vals: V, - runtime: Runtime, - ) -> Result<(), ProcessChromError> { - let options = self.options; - let fp = File::create(self.path.clone())?; - let mut file = BufWriter::new(fp); - - let (total_summary_offset, full_data_offset, pre_data) = BigWigWrite::write_pre(&mut file)?; - - let zooms_map: BTreeMap = - std::iter::successors(Some(options.initial_zoom_size), |z| Some(z * 4)) - .take(options.max_zooms as usize) - .map(|size| { - let section_iter = vec![]; - let (buf, write): (TempFileBuffer, TempFileBufferWriter) = - TempFileBuffer::new(options.inmemory); - let value = (section_iter, buf, Some(write)); - (size, value) - }) - .collect(); - - let mut chrom_ids = IdMap::default(); - - let mut summary: Option = None; - let (send, recv) = futures_mpsc::unbounded(); - let write_fut = write_chroms_with_zooms(file, zooms_map, recv); - let (write_fut, write_fut_handle) = write_fut.remote_handle(); - runtime.spawn(write_fut); - - let handle = runtime.handle(); - - let setup_chrom = || { - let (ftx, sections_handle, buf, section_receiver) = - future_channel(options.channel_size, runtime.handle(), options.inmemory); - - let (zoom_infos, zooms_channels) = { - let mut zoom_infos = 
Vec::with_capacity(options.max_zooms as usize); - let mut zooms_channels = Vec::with_capacity(options.max_zooms as usize); - - let zoom_sizes = - std::iter::successors(Some(options.initial_zoom_size), |z| Some(z * 4)) - .take(options.max_zooms as usize); - for size in zoom_sizes { - let (ftx, handle, buf, section_receiver) = - future_channel(options.channel_size, runtime.handle(), options.inmemory); - let zoom_info = TempZoomInfo { - resolution: size, - data_write_future: Box::new(handle), - data: buf, - sections: section_receiver, - }; - zoom_infos.push(zoom_info); - zooms_channels.push((size, ftx)); - } - (zoom_infos, zooms_channels) - }; - - match send.unbounded_send((section_receiver, buf, sections_handle, zoom_infos)) { - Ok(_) => {} - Err(_) => panic!("Expected to always send."), - } - - (zooms_channels, ftx) - }; - let mut do_read = |chrom: String| -> Result<_, ProcessChromError<_>> { - let length = match chrom_sizes.get(&chrom) { - Some(length) => *length, - None => { - return Err(ProcessChromError::InvalidChromosome(format!( - "Input bedGraph contains chromosome that isn't in the input chrom sizes: {}", - chrom - ))); - } - }; - // Make a new id for the chromosome - let chrom_id = chrom_ids.get_id(&chrom); - - let (zooms_channels, ftx) = setup_chrom(); - - let internal_data = crate::InternalProcessData( - zooms_channels, - ftx, - chrom_id, - options, - runtime.handle().clone(), - chrom, - length, - ); - Ok(BigWigFullProcess::create(internal_data)) - }; - - let mut advance = |p: BigWigFullProcess| { - let data = p.destroy(); - let ChromProcessedData(chrom_summary) = data; - match &mut summary { - None => summary = Some(chrom_summary), - Some(summary) => { - summary.total_items += chrom_summary.total_items; - summary.bases_covered += chrom_summary.bases_covered; - summary.min_val = summary.min_val.min(chrom_summary.min_val); - summary.max_val = summary.max_val.max(chrom_summary.max_val); - summary.sum += chrom_summary.sum; - summary.sum_squares += 
chrom_summary.sum_squares; - } - } - }; - - vals.process_to_bbi(handle, &mut do_read, &mut advance)?; - - drop(send); + chrom_sizes.clone(), + )?; - self.write_internal_post( - summary, - runtime, - write_fut_handle, + let ( chrom_ids, - pre_data, - chrom_sizes, - full_data_offset, - total_summary_offset, - ) - } - - fn write_internal_post( - self, - summary: Option, - runtime: Runtime, - write_fut_handle: impl Future< - Output = Result< - ( - BufWriter, - usize, - Vec>, - BTreeMap, - ), - ProcessChromError, - >, - >, - chrom_ids: IdMap, - pre_data: u64, - chrom_sizes: HashMap, - full_data_offset: u64, - total_summary_offset: u64, - ) -> Result<(), ProcessChromError> { - let summary = summary.unwrap_or(Summary { - total_items: 0, - bases_covered: 0, - min_val: 0.0, - max_val: 0.0, - sum: 0.0, - sum_squares: 0.0, - }); - - let (mut file, max_uncompressed_buf_size, section_iter, zooms_map) = - runtime.block_on(write_fut_handle)?; - - let zoom_infos: Vec = zooms_map - .into_iter() - .map(|(size, zoom)| { - drop(zoom.2); - let sections = zoom.0.into_iter().flatten(); - ZoomInfo { - resolution: size, - data: zoom.1, - sections, - } - }) - .collect(); - let raw_sections_iter = section_iter.into_iter().flatten(); + summary, + mut file, + raw_sections_iter, + zoom_infos, + max_uncompressed_buf_size, + ) = output; let chrom_ids = chrom_ids.get_map(); let (data_size, chrom_index_start, index_start, total_sections) = bbiwrite::write_mid( @@ -476,34 +195,10 @@ impl BigWigWrite { let vals = make_vals()?; - let runtime_handle = runtime.handle(); - - let process_chrom = |ftx: ChromProcessingInputSectionChannel, - chrom_id: u32, - options: BBIWriteOptions, - runtime: Handle, - chrom_values: Values, - chrom: String, - chrom_length: u32| { - let fut = BigWigWrite::process_chrom_no_zooms( - ftx, - chrom_id, - options, - runtime, - chrom_values, - chrom, - chrom_length, - ); - let (fut, handle) = fut.remote_handle(); - runtime_handle.spawn(fut); - handle - }; - - let output = 
bbiwrite::write_vals_no_zoom( + let output = bbiwrite::write_vals_no_zoom::<_, _, BigWigNoZoomsProcess>( vals, file, self.options, - process_chrom, &runtime, chrom_sizes.clone(), ); @@ -522,10 +217,9 @@ impl BigWigWrite { let vals = make_vals()?; - let output = bbiwrite::write_zoom_vals( + let output = bbiwrite::write_zoom_vals::<_, _, BigWigZoomsProcess<_>>( vals, self.options, - BigWigWrite::process_chrom_zoom, &runtime, &chrom_ids, (summary.bases_covered as f64 / summary.total_items as f64) as u32, @@ -710,134 +404,6 @@ impl BigWigWrite { Ok(()) } - - pub(crate) async fn process_chrom_no_zooms>( - mut ftx: ChromProcessingInputSectionChannel, - chrom_id: u32, - options: BBIWriteOptions, - runtime: Handle, - mut chrom_values: I, - chrom: String, - chrom_length: u32, - ) -> Result<(Summary, Vec<(u64, u64)>), ProcessChromError> { - #[derive(Debug, Copy, Clone)] - struct ZoomCounts { - resolution: u64, - current_end: u64, - counts: u64, - } - - let mut summary = Summary { - total_items: 0, - bases_covered: 0, - min_val: f64::MAX, - max_val: f64::MIN, - sum: 0.0, - sum_squares: 0.0, - }; - - let mut items: Vec = Vec::with_capacity(options.items_per_slot as usize); - let mut zoom_counts: Vec = std::iter::successors(Some(10), |z| Some(z * 4)) - .take_while(|z| *z <= u64::MAX / 4 && *z <= chrom_length as u64 * 4) - .map(|z| ZoomCounts { - resolution: z, - current_end: 0, - counts: 0, - }) - .collect(); - - while let Some(current_val) = chrom_values.next() { - // If there is a source error, propogate that up - let current_val = current_val.map_err(ProcessChromError::SourceError)?; - let next_val = match chrom_values.peek() { - None | Some(Err(_)) => None, - Some(Ok(v)) => Some(v), - }; - - BigWigWrite::process_val( - current_val, - next_val, - chrom_length, - &chrom, - &mut summary, - &mut items, - options, - &runtime, - &mut ftx, - chrom_id, - ) - .await?; - - for zoom in &mut zoom_counts { - if current_val.start as u64 >= zoom.current_end { - zoom.counts += 1; - 
zoom.current_end = current_val.start as u64 + zoom.resolution; - } - while current_val.end as u64 > zoom.current_end { - zoom.counts += 1; - zoom.current_end += zoom.resolution; - } - } - } - - debug_assert!(items.is_empty()); - - if summary.total_items == 0 { - summary.min_val = 0.0; - summary.max_val = 0.0; - } - - let zoom_counts = zoom_counts - .into_iter() - .map(|z| (z.resolution, z.counts)) - .collect(); - - Ok((summary, zoom_counts)) - } - - pub(crate) async fn process_chrom_zoom>( - zooms_channels: Vec<(u32, ChromProcessingInputSectionChannel)>, - chrom_id: u32, - options: BBIWriteOptions, - runtime: Handle, - mut chrom_values: I, - ) -> Result<(), ProcessChromError> { - let mut zoom_items: Vec = zooms_channels - .into_iter() - .map(|(size, channel)| ZoomItem { - size, - live_info: None, - records: Vec::with_capacity(options.items_per_slot as usize), - channel, - }) - .collect(); - - while let Some(current_val) = chrom_values.next() { - // If there is a source error, propogate that up - let current_val = current_val.map_err(ProcessChromError::SourceError)?; - let next_val = match chrom_values.peek() { - None | Some(Err(_)) => None, - Some(Ok(v)) => Some(v), - }; - - BigWigWrite::process_val_zoom( - &mut zoom_items, - options, - current_val, - next_val, - &runtime, - chrom_id, - ) - .await?; - } - - for zoom_item in zoom_items.iter_mut() { - debug_assert!(zoom_item.live_info.is_none()); - debug_assert!(zoom_item.records.is_empty()); - } - - Ok(()) - } } pub(crate) struct BigWigFullProcess { @@ -854,6 +420,8 @@ pub(crate) struct BigWigFullProcess { } impl ChromProcessCreate for BigWigFullProcess { + type I = InternalProcessData; + type Out = ChromProcessedData; fn create(internal_data: InternalProcessData) -> Self { let InternalProcessData(zooms_channels, ftx, chrom_id, options, runtime, chrom, length) = internal_data; @@ -962,6 +530,209 @@ impl ChromProcess for BigWigFullProcess { } } +#[derive(Debug, Copy, Clone)] +struct ZoomCounts { + resolution: u64, + 
current_end: u64, + counts: u64, +} +struct BigWigNoZoomsProcess { + ftx: ChromProcessingInputSectionChannel, + chrom_id: u32, + options: BBIWriteOptions, + runtime: Handle, + chrom: String, + length: u32, + + summary: Summary, + items: Vec, + zoom_counts: Vec, +} + +impl ChromProcessCreate for BigWigNoZoomsProcess { + type I = NoZoomsInternalProcessData; + type Out = NoZoomsInternalProcessedData; + fn create(internal_data: Self::I) -> Self { + let NoZoomsInternalProcessData(ftx, chrom_id, options, runtime, chrom, length) = + internal_data; + + let summary = Summary { + total_items: 0, + bases_covered: 0, + min_val: f64::MAX, + max_val: f64::MIN, + sum: 0.0, + sum_squares: 0.0, + }; + + let items: Vec = Vec::with_capacity(options.items_per_slot as usize); + let zoom_counts: Vec = std::iter::successors(Some(10), |z| Some(z * 4)) + .take_while(|z| *z <= u64::MAX / 4 && *z <= length as u64 * 4) + .map(|z| ZoomCounts { + resolution: z, + current_end: 0, + counts: 0, + }) + .collect(); + + BigWigNoZoomsProcess { + ftx, + chrom_id, + options, + runtime, + chrom, + length, + summary, + items, + zoom_counts, + } + } + fn destroy(self) -> Self::Out { + let BigWigNoZoomsProcess { + items, + mut summary, + zoom_counts, + .. 
+ } = self; + + debug_assert!(items.is_empty()); + + if summary.total_items == 0 { + summary.min_val = 0.0; + summary.max_val = 0.0; + } + + let zoom_counts = zoom_counts + .into_iter() + .map(|z| (z.resolution, z.counts)) + .collect(); + + NoZoomsInternalProcessedData(summary, zoom_counts) + } +} + +impl ChromProcess for BigWigNoZoomsProcess { + type Value = Value; + async fn do_process( + &mut self, + current_val: Self::Value, + next_val: Option<&Self::Value>, + ) -> Result<(), ProcessChromError> { + let BigWigNoZoomsProcess { + ftx, + chrom_id, + options, + runtime, + chrom, + length, + summary, + items, + zoom_counts, + } = self; + + BigWigWrite::process_val( + current_val, + next_val, + *length, + &chrom, + summary, + items, + *options, + &runtime, + ftx, + *chrom_id, + ) + .await?; + + for zoom in zoom_counts { + if current_val.start as u64 >= zoom.current_end { + zoom.counts += 1; + zoom.current_end = current_val.start as u64 + zoom.resolution; + } + while current_val.end as u64 > zoom.current_end { + zoom.counts += 1; + zoom.current_end += zoom.resolution; + } + } + + Ok(()) + } +} + +struct BigWigZoomsProcess { + temp_zoom_items: Vec>, + chrom_id: u32, + options: BBIWriteOptions, + runtime: Handle, + + zoom_items: Vec, +} + +impl ChromProcessCreate for BigWigZoomsProcess { + type I = ZoomsInternalProcessData; + type Out = ZoomsInternalProcessedData; + fn create(internal_data: Self::I) -> Self { + let ZoomsInternalProcessData(temp_zoom_items, zooms_channels, chrom_id, options, runtime) = + internal_data; + + let zoom_items: Vec = zooms_channels + .into_iter() + .map(|(size, channel)| ZoomItem { + size, + live_info: None, + records: Vec::with_capacity(options.items_per_slot as usize), + channel, + }) + .collect(); + + BigWigZoomsProcess { + temp_zoom_items, + chrom_id, + options, + runtime, + zoom_items, + } + } + fn destroy(self) -> Self::Out { + let BigWigZoomsProcess { zoom_items, .. 
} = self; + + for zoom_item in zoom_items.iter() { + debug_assert!(zoom_item.live_info.is_none()); + debug_assert!(zoom_item.records.is_empty()); + } + + ZoomsInternalProcessedData(self.temp_zoom_items) + } +} +impl ChromProcess for BigWigZoomsProcess { + type Value = Value; + async fn do_process( + &mut self, + current_val: Self::Value, + next_val: Option<&Self::Value>, + ) -> Result<(), ProcessChromError> { + let BigWigZoomsProcess { + chrom_id, + options, + runtime, + zoom_items, + .. + } = self; + + BigWigWrite::process_val_zoom( + zoom_items, + *options, + current_val, + next_val, + &runtime, + *chrom_id, + ) + .await?; + + Ok(()) + } +} + async fn encode_section( compress: bool, items_in_section: Vec, diff --git a/bigtools/src/utils/cli/bedgraphtobigwig.rs b/bigtools/src/utils/cli/bedgraphtobigwig.rs index 35317cd..867e3ec 100644 --- a/bigtools/src/utils/cli/bedgraphtobigwig.rs +++ b/bigtools/src/utils/cli/bedgraphtobigwig.rs @@ -102,7 +102,7 @@ pub fn bedgraphtobigwig(args: BedGraphToBigWigArgs) -> Result<(), Box let vals_iter = BedParser::from_bedgraph_file(stdin); let chsi = BedParserStreamingIterator::new(vals_iter, allow_out_of_order_chroms); - outb.write_singlethreaded(chrom_map, chsi, runtime)?; + outb.write(chrom_map, chsi, runtime)?; } else { let infile = File::open(&bedgraphpath)?; let (parallel, parallel_required) = match (nthreads, args.parallel.as_ref()) { diff --git a/bigtools/src/utils/cli/bigwigmerge.rs b/bigtools/src/utils/cli/bigwigmerge.rs index fb2dc82..e08b350 100644 --- a/bigtools/src/utils/cli/bigwigmerge.rs +++ b/bigtools/src/utils/cli/bigwigmerge.rs @@ -11,9 +11,8 @@ use crate::utils::chromvalues::ChromValues; use crate::utils::merge::merge_sections_many; use crate::utils::reopen::ReopenableFile; use crate::{BBIReadError, BigWigRead, BigWigWrite, ChromProcess}; -use crate::{ChromData, ChromDataState, ChromProcessingKey, ProcessChromError}; -use crate::{ChromData2, Value}; -use tokio::runtime; +use crate::{ChromData, ProcessChromError, 
Value}; +use tokio::runtime::{self, Runtime}; use super::BBIWriteArgs; @@ -361,45 +360,13 @@ struct ChromGroupReadImpl { impl ChromData for ChromGroupReadImpl { type Values = MergingValues; - - fn advance< - State, - F: FnMut( - String, - MergingValues, - &mut State, - ) -> Result>, - >( - &mut self, - do_read: &mut F, - state: &mut State, - ) -> Result< - ChromDataState, - ProcessChromError, - > { - let next: Option> = - self.iter.next(); - Ok(match next { - Some(Err(err)) => ChromDataState::Error(err.into()), - Some(Ok((chrom, _, mergingvalues))) => { - let read = do_read(chrom, mergingvalues, state)?; - - ChromDataState::NewChrom(read) - } - None => ChromDataState::Finished, - }) - } -} - -impl ChromData2 for ChromGroupReadImpl { - type Values = MergingValues; fn process_to_bbi< P: ChromProcess::Value>, StartProcessing: FnMut(String) -> Result::Error>>, - Advance: FnMut(P), + Advance: FnMut(P) -> Result<(), ProcessChromError<::Error>>, >( &mut self, - runtime: &runtime::Handle, + runtime: &Runtime, start_processing: &mut StartProcessing, advance: &mut Advance, ) -> Result<(), ProcessChromError<::Error>> { @@ -422,7 +389,7 @@ impl ChromData2 for ChromGroupReadImpl { runtime.block_on(read)?; } - advance(p); + advance(p)?; } Some(Err(e)) => return Err(ProcessChromError::SourceError(e)), None => break, diff --git a/bigtools/tests/bigwigwrite.rs b/bigtools/tests/bigwigwrite.rs index cf7370e..07c9e0c 100644 --- a/bigtools/tests/bigwigwrite.rs +++ b/bigtools/tests/bigwigwrite.rs @@ -192,6 +192,5 @@ fn test_iter() { let tempfile = tempfile::NamedTempFile::new().unwrap(); let outb = BigWigWrite::create_file(tempfile.path().to_string_lossy().to_string()); - outb.write_singlethreaded(chrom_map, vals_iter, runtime) - .unwrap(); + outb.write(chrom_map, vals_iter, runtime).unwrap(); } From 4c21582472fc306cf2f946f4deb1827b322cae24 Mon Sep 17 00:00:00 2001 From: Jack Huey <31162821+jackh726@users.noreply.github.com> Date: Fri, 19 Apr 2024 21:32:28 -0400 Subject: [PATCH 10/31] 
Don't call block_on inside task --- bigtools/src/bbi/bedchromdata.rs | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/bigtools/src/bbi/bedchromdata.rs b/bigtools/src/bbi/bedchromdata.rs index e0ecdb6..a978146 100644 --- a/bigtools/src/bbi/bedchromdata.rs +++ b/bigtools/src/bbi/bedchromdata.rs @@ -164,7 +164,6 @@ impl ChromData for BedParserParallelStreamingIterator { } let mut p = start_processing(chrom)?; - let runtime_handle = runtime.handle().clone(); let data: tokio::task::JoinHandle< Result>, > = runtime.spawn(async move { @@ -177,8 +176,8 @@ impl ChromData for BedParserParallelStreamingIterator { Some(Ok(v)) => Some(v), }; - let read = p.do_process(current_val, next_val); - runtime_handle.block_on(read)?; + let read = p.do_process::(current_val, next_val); + read.await?; } Ok(p) }); @@ -275,24 +274,19 @@ mod tests { _next_val: Option<&Self::Value>, ) -> Result<(), ProcessChromError> { self.count += 1; - dbg!(self.count); Ok(()) } } - let mut start_processing = |chrom: String| { - dbg!(chrom); - Ok(TestChromProcess::create(())) - }; + let mut start_processing = |_: String| Ok(TestChromProcess::create(())); let mut advance = |p: TestChromProcess| { - dbg!(p.count); counts.push(p.count); let _ = p.destroy(); Ok(()) }; chsi.process_to_bbi(&runtime, &mut start_processing, &mut advance) .unwrap(); - assert_eq!(counts, vec![]); + assert_eq!(counts, vec![200, 200, 200, 200, 200, 2000]); Ok(()) } From 4523905cd4ea5a77289301b152c3e831fe3847db Mon Sep 17 00:00:00 2001 From: Jack Huey <31162821+jackh726@users.noreply.github.com> Date: Sat, 20 Apr 2024 12:15:28 -0400 Subject: [PATCH 11/31] Use FileView for BedParserParallelStreamingIterator --- bigtools/src/bbi/bedchromdata.rs | 11 ++++++----- bigtools/src/utils/file/file_view.rs | 2 ++ 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/bigtools/src/bbi/bedchromdata.rs b/bigtools/src/bbi/bedchromdata.rs index a978146..1745979 100644 --- a/bigtools/src/bbi/bedchromdata.rs +++ 
b/bigtools/src/bbi/bedchromdata.rs @@ -9,7 +9,7 @@ use std::collections::VecDeque; use std::fs::File; -use std::io::{BufReader, Seek, SeekFrom}; +use std::io::BufReader; use std::path::PathBuf; use tokio::runtime::Runtime; @@ -18,6 +18,7 @@ use crate::bed::bedparser::{ BedChromData, BedFileStream, BedParser, BedValueError, Parser, StateValue, StreamingBedValues, }; use crate::utils::chromvalues::ChromValues; +use crate::utils::file_view::FileView; use crate::utils::streaming_linereader::StreamingLineReader; use crate::{ChromData, ChromProcess, ProcessChromError}; @@ -135,19 +136,19 @@ impl ChromData for BedParserParallelStreamingIterator { let mut queued_reads: VecDeque<_> = VecDeque::new(); loop { while remaining && queued_reads.len() < (4 + 1) { - let curr = match self.chrom_indices.pop() { - Some(c) => c, + let (curr, next) = match self.chrom_indices.pop() { + Some(c) => (c, self.chrom_indices.get(0)), None => { remaining = false; break; } }; - let mut file = match File::open(&self.path) { + let file = match File::open(&self.path) { Ok(f) => f, Err(err) => return Err(ProcessChromError::SourceError(err.into())), }; - file.seek(SeekFrom::Start(curr.0))?; + let file = FileView::new(file, curr.0, next.map(|n| n.0).unwrap_or(u64::MAX))?; let mut parser = BedParser::new(BedFileStream { bed: StreamingLineReader::new(BufReader::new(file)), parse: self.parse_fn, diff --git a/bigtools/src/utils/file/file_view.rs b/bigtools/src/utils/file/file_view.rs index 2cf0a49..671bda4 100644 --- a/bigtools/src/utils/file/file_view.rs +++ b/bigtools/src/utils/file/file_view.rs @@ -9,6 +9,8 @@ pub struct FileView { } impl FileView { + /// Creates a File view between two file offsets. Note that `end` will be + /// truncated to the actual end of the file. 
pub fn new(mut file: File, start: u64, end: u64) -> io::Result { let file_end = file.seek(io::SeekFrom::End(0))?; file.seek(io::SeekFrom::Start(start))?; From c647ee040b8277aa2195cf75995648b5e5cfa8c4 Mon Sep 17 00:00:00 2001 From: Jack Huey <31162821+jackh726@users.noreply.github.com> Date: Sat, 20 Apr 2024 12:39:29 -0400 Subject: [PATCH 12/31] Uplift Value and Error from ChromValues to ChromData --- bigtools/src/bbi/bbiwrite.rs | 41 +++++++++++++-------------- bigtools/src/bbi/bedchromdata.rs | 22 +++++++------- bigtools/src/bbi/bigbedwrite.rs | 20 ++++++------- bigtools/src/bbi/bigwigwrite.rs | 20 ++++++------- bigtools/src/utils/cli/bigwigmerge.rs | 12 ++++---- 5 files changed, 54 insertions(+), 61 deletions(-) diff --git a/bigtools/src/bbi/bbiwrite.rs b/bigtools/src/bbi/bbiwrite.rs index edeeada..548909f 100644 --- a/bigtools/src/bbi/bbiwrite.rs +++ b/bigtools/src/bbi/bbiwrite.rs @@ -19,7 +19,6 @@ use futures::stream::StreamExt; use serde::{Deserialize, Serialize}; use tokio::runtime::{Handle, Runtime}; -use crate::utils::chromvalues::ChromValues; use crate::utils::idmap::IdMap; use crate::utils::tell::Tell; use crate::utils::tempfilebuffer::{TempFileBuffer, TempFileBufferWriter}; @@ -557,18 +556,19 @@ pub struct ChromProcessedData(pub(crate) Summary); /// Effectively like an Iterator of chromosome data pub trait ChromData: Sized { - type Values: ChromValues; + type Value; + type Error: Error + Send + 'static; fn process_to_bbi< - P: ChromProcess::Value> + Send + 'static, - StartProcessing: FnMut(String) -> Result::Error>>, - Advance: FnMut(P) -> Result<(), ProcessChromError<::Error>>, + P: ChromProcess + Send + 'static, + StartProcessing: FnMut(String) -> Result>, + Advance: FnMut(P) -> Result<(), ProcessChromError>, >( &mut self, runtime: &Runtime, start_processing: &mut StartProcessing, advance: &mut Advance, - ) -> Result<(), ProcessChromError<::Error>>; + ) -> Result<(), ProcessChromError>; } // Zooms have to be double-buffered: first because chroms could 
be processed in parallel and second because we don't know the offset of each zoom immediately @@ -714,9 +714,8 @@ pub trait ChromProcess: process_internal::ChromProcessCreate { } pub(crate) fn write_vals< - Values: ChromValues, - V: ChromData, - P: ChromProcess + V: ChromData, + P: ChromProcess + process_internal::ChromProcessCreate + Send + 'static, @@ -735,7 +734,7 @@ pub(crate) fn write_vals< Vec, usize, ), - ProcessChromError, + ProcessChromError, > { let zooms_map: BTreeMap = std::iter::successors(Some(options.initial_zoom_size), |z| Some(z * 4)) @@ -884,9 +883,8 @@ pub(crate) struct NoZoomsInternalProcessData( pub(crate) struct NoZoomsInternalProcessedData(pub(crate) Summary, pub(crate) Vec<(u64, u64)>); pub(crate) fn write_vals_no_zoom< - Values: ChromValues, - V: ChromData, - P: ChromProcess + V: ChromData, + P: ChromProcess + process_internal::ChromProcessCreate< I = NoZoomsInternalProcessData, Out = NoZoomsInternalProcessedData, @@ -907,7 +905,7 @@ pub(crate) fn write_vals_no_zoom< Flatten>>, usize, ), - ProcessChromError, + ProcessChromError, > { let total_zoom_counts = std::iter::successors(Some(10), |z: &u64| Some((*z).saturating_mul(4))) .take_while(|z| *z < u64::MAX) @@ -918,7 +916,7 @@ pub(crate) fn write_vals_no_zoom< let mut summary: Option = None; let (send, recv) = futures_mpsc::unbounded(); - let write_fut = write_chroms_without_zooms::(file, recv); + let write_fut = write_chroms_without_zooms::(file, recv); let (write_fut, write_fut_handle) = write_fut.remote_handle(); runtime.spawn(write_fut); @@ -1035,12 +1033,11 @@ pub(crate) struct ZoomsInternalProcessData( pub(crate) struct ZoomsInternalProcessedData(pub(crate) Vec>); pub(crate) fn write_zoom_vals< - Values: ChromValues, - V: ChromData, - P: ChromProcess + V: ChromData, + P: ChromProcess + process_internal::ChromProcessCreate< - I = ZoomsInternalProcessData, - Out = ZoomsInternalProcessedData, + I = ZoomsInternalProcessData, + Out = ZoomsInternalProcessedData, > + Send + 'static, >( @@ 
-1052,7 +1049,7 @@ pub(crate) fn write_zoom_vals< zoom_counts: BTreeMap, mut file: BufWriter, data_size: u64, -) -> Result<(BufWriter, Vec, usize), ProcessChromError> { +) -> Result<(BufWriter, Vec, usize), ProcessChromError> { let min_first_zoom_size = average_size.max(10) * 4; let mut zooms_map: BTreeMap = zoom_counts .into_iter() @@ -1083,7 +1080,7 @@ pub(crate) fn write_zoom_vals< let mut max_uncompressed_buf_size = 0; - let mut do_read = |chrom: String| -> Result> { + let mut do_read = |chrom: String| -> Result> { // Make a new id for the chromosome let chrom_id = *chrom_ids .get(&chrom) diff --git a/bigtools/src/bbi/bedchromdata.rs b/bigtools/src/bbi/bedchromdata.rs index 1745979..e1c0ed1 100644 --- a/bigtools/src/bbi/bedchromdata.rs +++ b/bigtools/src/bbi/bedchromdata.rs @@ -39,18 +39,19 @@ impl BedParserStreamingIterator { } impl ChromData for BedParserStreamingIterator { - type Values = BedChromData; + type Value = S::Value; + type Error = BedValueError; fn process_to_bbi< - P: ChromProcess::Value>, - StartProcessing: FnMut(String) -> Result::Error>>, - Advance: FnMut(P) -> Result<(), ProcessChromError<::Error>>, + P: ChromProcess, + StartProcessing: FnMut(String) -> Result>, + Advance: FnMut(P) -> Result<(), ProcessChromError>, >( &mut self, runtime: &Runtime, start_processing: &mut StartProcessing, advance: &mut Advance, - ) -> Result<(), ProcessChromError<::Error>> { + ) -> Result<(), ProcessChromError> { loop { match self.bed_data.next_chrom() { Some(Ok((chrom, mut group))) => { @@ -120,18 +121,19 @@ impl BedParserParallelStreamingIterator { } impl ChromData for BedParserParallelStreamingIterator { - type Values = BedChromData>>; + type Value = V; + type Error = BedValueError; fn process_to_bbi< - P: ChromProcess::Value> + Send + 'static, - StartProcessing: FnMut(String) -> Result::Error>>, - Advance: FnMut(P) -> Result<(), ProcessChromError<::Error>>, + P: ChromProcess + Send + 'static, + StartProcessing: FnMut(String) -> Result>, + Advance: FnMut(P) 
-> Result<(), ProcessChromError>, >( &mut self, runtime: &Runtime, start_processing: &mut StartProcessing, advance: &mut Advance, - ) -> Result<(), ProcessChromError> { + ) -> Result<(), ProcessChromError> { let mut remaining = true; let mut queued_reads: VecDeque<_> = VecDeque::new(); loop { diff --git a/bigtools/src/bbi/bigbedwrite.rs b/bigtools/src/bbi/bigbedwrite.rs index ec27a16..80bde65 100644 --- a/bigtools/src/bbi/bigbedwrite.rs +++ b/bigtools/src/bbi/bigbedwrite.rs @@ -11,7 +11,6 @@ use byteorder::{NativeEndian, WriteBytesExt}; use tokio::runtime::{Handle, Runtime}; use crate::bbiwrite::process_internal::ChromProcessCreate; -use crate::utils::chromvalues::ChromValues; use crate::utils::indexlist::IndexList; use crate::utils::tell::Tell; use crate::{ @@ -82,19 +81,19 @@ impl BigBedWrite { } /// Write the values from `V` as a bigWig. Will utilize the provided runtime for encoding values and for reading through the values (potentially parallelized by chromosome). - pub fn write, V: ChromData>( + pub fn write>( self, chrom_sizes: HashMap, vals: V, runtime: Runtime, - ) -> Result<(), ProcessChromError> { + ) -> Result<(), ProcessChromError> { let fp = File::create(self.path.clone())?; let mut file = BufWriter::new(fp); let (autosql_offset, total_summary_offset, full_data_offset, pre_data) = BigBedWrite::write_pre(&mut file, &self.autosql)?; - let output = bbiwrite::write_vals::<_, _, BigBedFullProcess>( + let output = bbiwrite::write_vals::<_, BigBedFullProcess>( vals, file, self.options, @@ -142,15 +141,12 @@ impl BigBedWrite { /// Write the values from `V` as a bigBed. Will utilize the provided runtime for encoding values and for reading through the values (potentially parallelized by chromosome). /// This will take two passes on the provided values: first to write the values themselves, then the zooms. This is beneficial over `write` on smaller files, where the encoding of /// high resolution zooms takes up a substantial portion of total processing time. 
- pub fn write_multipass< - Values: ChromValues + Send + 'static, - V: ChromData, - >( + pub fn write_multipass>( self, - make_vals: impl Fn() -> Result>, + make_vals: impl Fn() -> Result>, chrom_sizes: HashMap, runtime: Runtime, - ) -> Result<(), ProcessChromError> { + ) -> Result<(), ProcessChromError> { let fp = File::create(self.path.clone())?; let mut file = BufWriter::new(fp); @@ -159,7 +155,7 @@ impl BigBedWrite { let vals = make_vals()?; - let output = bbiwrite::write_vals_no_zoom::<_, _, BigBedNoZoomsProcess>( + let output = bbiwrite::write_vals_no_zoom::<_, BigBedNoZoomsProcess>( vals, file, self.options, @@ -181,7 +177,7 @@ impl BigBedWrite { let vals = make_vals()?; - let output = bbiwrite::write_zoom_vals::<_, _, BigBedZoomsProcess<_>>( + let output = bbiwrite::write_zoom_vals::<_, BigBedZoomsProcess<_>>( vals, self.options, &runtime, diff --git a/bigtools/src/bbi/bigwigwrite.rs b/bigtools/src/bbi/bigwigwrite.rs index 093dc13..27719b3 100644 --- a/bigtools/src/bbi/bigwigwrite.rs +++ b/bigtools/src/bbi/bigwigwrite.rs @@ -53,7 +53,6 @@ use byteorder::{NativeEndian, WriteBytesExt}; use tokio::runtime::{Handle, Runtime}; use crate::bbiwrite::process_internal::ChromProcessCreate; -use crate::utils::chromvalues::ChromValues; use crate::utils::tell::Tell; use crate::{ write_info, ChromData, ChromProcess, ChromProcessedData, ChromProcessingInputSectionChannel, @@ -114,19 +113,19 @@ impl BigWigWrite { } /// Write the values from `V` as a bigWig. Will utilize the provided runtime for encoding values and for reading through the values (potentially parallelized by chromosome). 
- pub fn write, V: ChromData>( + pub fn write>( self, chrom_sizes: HashMap, vals: V, runtime: Runtime, - ) -> Result<(), ProcessChromError> { + ) -> Result<(), ProcessChromError> { let options = self.options; let fp = File::create(self.path.clone())?; let mut file = BufWriter::new(fp); let (total_summary_offset, full_data_offset, pre_data) = BigWigWrite::write_pre(&mut file)?; - let output = bbiwrite::write_vals::<_, _, BigWigFullProcess>( + let output = bbiwrite::write_vals::<_, BigWigFullProcess>( vals, file, options, @@ -179,15 +178,12 @@ impl BigWigWrite { /// Write the values from `V` as a bigWig. Will utilize the provided runtime for encoding values and for reading through the values (potentially parallelized by chromosome). /// This will take two passes on the provided values: first to write the values themselves, then the zooms. This is beneficial over `write` on smaller files, where the encoding of /// high resolution zooms takes up a substantial portion of total processing time. 
- pub fn write_multipass< - Values: ChromValues + Send + 'static, - V: ChromData, - >( + pub fn write_multipass>( self, - make_vals: impl Fn() -> Result>, + make_vals: impl Fn() -> Result>, chrom_sizes: HashMap, runtime: Runtime, - ) -> Result<(), ProcessChromError> { + ) -> Result<(), ProcessChromError> { let fp = File::create(self.path.clone())?; let mut file = BufWriter::new(fp); @@ -195,7 +191,7 @@ impl BigWigWrite { let vals = make_vals()?; - let output = bbiwrite::write_vals_no_zoom::<_, _, BigWigNoZoomsProcess>( + let output = bbiwrite::write_vals_no_zoom::<_, BigWigNoZoomsProcess>( vals, file, self.options, @@ -217,7 +213,7 @@ impl BigWigWrite { let vals = make_vals()?; - let output = bbiwrite::write_zoom_vals::<_, _, BigWigZoomsProcess<_>>( + let output = bbiwrite::write_zoom_vals::<_, BigWigZoomsProcess<_>>( vals, self.options, &runtime, diff --git a/bigtools/src/utils/cli/bigwigmerge.rs b/bigtools/src/utils/cli/bigwigmerge.rs index e08b350..53475e8 100644 --- a/bigtools/src/utils/cli/bigwigmerge.rs +++ b/bigtools/src/utils/cli/bigwigmerge.rs @@ -359,17 +359,19 @@ struct ChromGroupReadImpl { } impl ChromData for ChromGroupReadImpl { - type Values = MergingValues; + type Value = Value; + type Error = MergingValuesError; + fn process_to_bbi< - P: ChromProcess::Value>, - StartProcessing: FnMut(String) -> Result::Error>>, - Advance: FnMut(P) -> Result<(), ProcessChromError<::Error>>, + P: ChromProcess, + StartProcessing: FnMut(String) -> Result>, + Advance: FnMut(P) -> Result<(), ProcessChromError>, >( &mut self, runtime: &Runtime, start_processing: &mut StartProcessing, advance: &mut Advance, - ) -> Result<(), ProcessChromError<::Error>> { + ) -> Result<(), ProcessChromError> { loop { let next: Option> = self.iter.next(); From cf0dc857b09920a8912cafb1c1aa51f83c732ada Mon Sep 17 00:00:00 2001 From: Jack Huey <31162821+jackh726@users.noreply.github.com> Date: Sat, 20 Apr 2024 13:13:55 -0400 Subject: [PATCH 13/31] Remove ChromValues --- 
bigtools/src/bbi/bedchromdata.rs | 24 ++++-------- bigtools/src/bed/bedparser.rs | 32 ++++++++-------- bigtools/src/utils/chromvalues.rs | 10 ----- bigtools/src/utils/cli/bedtobigbed.rs | 3 +- bigtools/src/utils/cli/bigwigmerge.rs | 53 +++++++++++---------------- bigtools/src/utils/mod.rs | 1 - bigtools/tests/bigbedwrite.rs | 3 +- bigtools/tests/bigwigwrite.rs | 1 - 8 files changed, 45 insertions(+), 82 deletions(-) delete mode 100644 bigtools/src/utils/chromvalues.rs diff --git a/bigtools/src/bbi/bedchromdata.rs b/bigtools/src/bbi/bedchromdata.rs index e1c0ed1..8a2f0d2 100644 --- a/bigtools/src/bbi/bedchromdata.rs +++ b/bigtools/src/bbi/bedchromdata.rs @@ -17,7 +17,6 @@ use tokio::runtime::Runtime; use crate::bed::bedparser::{ BedChromData, BedFileStream, BedParser, BedValueError, Parser, StateValue, StreamingBedValues, }; -use crate::utils::chromvalues::ChromValues; use crate::utils::file_view::FileView; use crate::utils::streaming_linereader::StreamingLineReader; use crate::{ChromData, ChromProcess, ProcessChromError}; @@ -69,10 +68,7 @@ impl ChromData for BedParserStreamingIterator { while let Some(current_val) = group.next() { // If there is a source error, propogate that up let current_val = current_val.map_err(ProcessChromError::SourceError)?; - let next_val = match group.peek() { - None | Some(Err(_)) => None, - Some(Ok(v)) => Some(v), - }; + let next_val = group.peek_val(); let read = p.do_process(current_val, next_val); runtime.block_on(read)?; @@ -174,10 +170,7 @@ impl ChromData for BedParserParallelStreamingIterator { // If there is a source error, propogate that up let current_val = current_val.map_err(ProcessChromError::SourceError)?; - let next_val = match group.peek() { - None | Some(Err(_)) => None, - Some(Ok(v)) => Some(v), - }; + let next_val = group.peek_val(); let read = p.do_process::(current_val, next_val); read.await?; @@ -203,11 +196,8 @@ impl ChromData for BedParserParallelStreamingIterator { } } -impl ChromValues for BedChromData { - type 
Value = S::Value; - type Error = BedValueError; - - fn next(&mut self) -> Option> { +impl BedChromData { + pub fn next(&mut self) -> Option> { let state = self.load_state()?; let ret = state.load_state_and_take_value(); if matches!(state.state_value, StateValue::DiffChrom(..)) { @@ -216,15 +206,15 @@ impl ChromValues for BedChromData { ret } - fn peek(&mut self) -> Option> { + pub fn peek_val(&mut self) -> Option<&S::Value> { let state = self.load_state()?; state.load_state(false); let ret = match &state.state_value { StateValue::Empty => None, - StateValue::Value(_, val) => Some(Ok(val)), + StateValue::Value(_, val) => Some(val), StateValue::EmptyValue(_) => None, // Shouldn't occur StateValue::DiffChrom(_, _) => None, // Only `Value` is peekable - StateValue::Error(err) => Some(Err(err)), + StateValue::Error(_) => None, StateValue::Done => None, }; ret diff --git a/bigtools/src/bed/bedparser.rs b/bigtools/src/bed/bedparser.rs index a18788d..ef251f8 100644 --- a/bigtools/src/bed/bedparser.rs +++ b/bigtools/src/bed/bedparser.rs @@ -422,14 +422,12 @@ impl Drop for BedChromData { } } -#[cfg(all(test, features = "write"))] +#[cfg(test)] mod tests { use super::*; use std::fs::File; use std::path::PathBuf; - use crate::utils::chromvalues::ChromValues; - #[test] fn test_bed_works() { let mut dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")); @@ -452,7 +450,7 @@ mod tests { end: $end, rest: $rest.to_string() }, - $group.peek().unwrap().unwrap() + $group.peek_val().unwrap() ); }; (next $group:expr, $start:literal $end:literal $rest:expr) => { @@ -482,29 +480,29 @@ mod tests { check_value!(peek next group, 1 100 "test1\t0"); check_value!(peek next group, 101 200 "test2\t0"); check_value!(peek next group, 201 300 "test3\t0"); - assert!(group.peek().is_none()); + assert!(group.peek_val().is_none()); assert!(group.next().is_none()); - assert!(group.peek().is_none()); + assert!(group.peek_val().is_none()); } { let (chrom, mut group) = bgp.next_chrom().unwrap().unwrap(); 
check_value!(chrom "chr18"); check_value!(peek next group, 1 100 "test4\t0"); check_value!(peek next group, 101 200 "test5\t0"); - assert!(group.peek().is_none()); + assert!(group.peek_val().is_none()); assert!(group.next().is_none()); - assert!(group.peek().is_none()); + assert!(group.peek_val().is_none()); } { let (chrom, mut group) = bgp.next_chrom().unwrap().unwrap(); check_value!(chrom "chr19"); check_value!(peek next group, 1 100 "test6\t0"); - assert!(group.peek().is_none()); + assert!(group.peek_val().is_none()); assert!(group.next().is_none()); - assert!(group.peek().is_none()); + assert!(group.peek_val().is_none()); } assert!(matches!(bgp.next_chrom(), None)); } @@ -531,7 +529,7 @@ mod tests { end: $end, value: 0.5, }, - $group.peek().unwrap().unwrap() + $group.peek_val().unwrap() ); }; (next $group:expr, $start:literal $end:literal) => { @@ -561,29 +559,29 @@ mod tests { check_value!(peek next group, 1 100); check_value!(peek next group, 101 200); check_value!(peek next group, 201 300); - assert!(group.peek().is_none()); + assert!(group.peek_val().is_none()); assert!(group.next().is_none()); - assert!(group.peek().is_none()); + assert!(group.peek_val().is_none()); } { let (chrom, mut group) = bgp.next_chrom().unwrap().unwrap(); check_value!(chrom "chr18"); check_value!(peek next group, 1 100); check_value!(peek next group, 101 200); - assert!(group.peek().is_none()); + assert!(group.peek_val().is_none()); assert!(group.next().is_none()); - assert!(group.peek().is_none()); + assert!(group.peek_val().is_none()); } { let (chrom, mut group) = bgp.next_chrom().unwrap().unwrap(); check_value!(chrom "chr19"); check_value!(peek next group, 1 100); - assert!(group.peek().is_none()); + assert!(group.peek_val().is_none()); assert!(group.next().is_none()); - assert!(group.peek().is_none()); + assert!(group.peek_val().is_none()); } assert!(matches!(bgp.next_chrom(), None)); } diff --git a/bigtools/src/utils/chromvalues.rs b/bigtools/src/utils/chromvalues.rs deleted 
file mode 100644 index 3fcc137..0000000 --- a/bigtools/src/utils/chromvalues.rs +++ /dev/null @@ -1,10 +0,0 @@ -use std::error::Error; -use std::io; - -pub trait ChromValues { - type Value; - type Error: Error + Send + From + 'static; - - fn next(&mut self) -> Option>; - fn peek(&mut self) -> Option>; -} diff --git a/bigtools/src/utils/cli/bedtobigbed.rs b/bigtools/src/utils/cli/bedtobigbed.rs index 8e3b10f..28fa4fd 100644 --- a/bigtools/src/utils/cli/bedtobigbed.rs +++ b/bigtools/src/utils/cli/bedtobigbed.rs @@ -107,11 +107,10 @@ pub fn bedtobigbed(args: BedToBigBedArgs) -> Result<(), Box> { let autosql = match args.autosql.as_ref() { None => { - use crate::utils::chromvalues::ChromValues; let infile = File::open(&bedpath)?; let mut vals_iter = BedParser::from_bed_file(infile); let (_, mut group) = vals_iter.next_chrom().unwrap().unwrap(); - let first = group.peek().unwrap().unwrap(); + let first = group.peek_val().unwrap(); crate::bed::autosql::bed_autosql(&first.rest) } Some(file) => std::fs::read_to_string(file)?, diff --git a/bigtools/src/utils/cli/bigwigmerge.rs b/bigtools/src/utils/cli/bigwigmerge.rs index 53475e8..91c97d4 100644 --- a/bigtools/src/utils/cli/bigwigmerge.rs +++ b/bigtools/src/utils/cli/bigwigmerge.rs @@ -7,7 +7,6 @@ use clap::Parser; use crossbeam_channel::unbounded; use thiserror::Error; -use crate::utils::chromvalues::ChromValues; use crate::utils::merge::merge_sections_many; use crate::utils::reopen::ReopenableFile; use crate::{BBIReadError, BigWigRead, BigWigWrite, ChromProcess}; @@ -146,8 +145,13 @@ pub fn bigwigmerge(args: BigWigMergeArgs) -> Result<(), Box> { for v in iter { let (chrom, _, mut values) = v?; - while let Some(val) = values.next() { - let val = val?; + + loop { + let val = match values.iter.next() { + Some(Ok(v)) => v, + Some(Err(e)) => Err(e)?, + None => break, + }; writer.write_fmt(format_args!( "{}\t{}\t{}\t{}\n", chrom, val.start, val.end, val.value @@ -208,27 +212,6 @@ pub enum MergingValuesError { IoError(#[from] 
io::Error), } -impl ChromValues for MergingValues { - type Value = Value; - type Error = MergingValuesError; - - fn next(&mut self) -> Option> { - match self.iter.next() { - Some(Ok(v)) => Some(Ok(v)), - Some(Err(e)) => Some(Err(e.into())), - None => None, - } - } - - fn peek(&mut self) -> Option> { - match self.iter.peek() { - Some(Ok(v)) => Some(Ok(v)), - Some(Err(err)) => Some(Err(err)), - None => None, - } - } -} - pub fn get_merged_vals( bigwigs: Vec>, max_zooms: usize, @@ -323,8 +306,12 @@ pub fn get_merged_vals( let chunk = vals.by_ref().take(max_bw_fds).collect::>(); let mut mergingvalues = MergingValues::new(chunk, threshold, adjust, clip); let (sender, receiver) = unbounded::(); - while let Some(val) = mergingvalues.next() { - let val = val?; + loop { + let val = match mergingvalues.iter.next() { + Some(Ok(v)) => v, + Some(Err(e)) => Err(e)?, + None => break, + }; sender.send(val).unwrap(); } @@ -379,14 +366,16 @@ impl ChromData for ChromGroupReadImpl { Some(Ok((chrom, _, mut group))) => { let mut p = start_processing(chrom)?; - while let Some(current_val) = group.next() { - // If there is a source error, propogate that up - let current_val = current_val.map_err(ProcessChromError::SourceError)?; - let next_val = match group.peek() { - None | Some(Err(_)) => None, + loop { + let current_val = match group.iter.next() { + Some(Ok(v)) => v, + Some(Err(e)) => Err(ProcessChromError::SourceError(e))?, + None => break, + }; + let next_val = match group.iter.peek() { Some(Ok(v)) => Some(v), + Some(Err(_)) | None => None, }; - let read = p.do_process(current_val, next_val); runtime.block_on(read)?; } diff --git a/bigtools/src/utils/mod.rs b/bigtools/src/utils/mod.rs index 9ca2629..012aeb9 100644 --- a/bigtools/src/utils/mod.rs +++ b/bigtools/src/utils/mod.rs @@ -1,4 +1,3 @@ -pub mod chromvalues; pub mod file; pub mod fill; pub mod idmap; diff --git a/bigtools/tests/bigbedwrite.rs b/bigtools/tests/bigbedwrite.rs index 2ef1216..c3164fb 100644 --- 
a/bigtools/tests/bigbedwrite.rs +++ b/bigtools/tests/bigbedwrite.rs @@ -12,7 +12,6 @@ fn bigbedwrite_test() -> Result<(), Box> { use tempfile; use bigtools::bed::bedparser::BedParser; - use bigtools::utils::chromvalues::ChromValues; use bigtools::{BigBedRead, BigBedWrite}; let mut dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")); @@ -39,7 +38,7 @@ fn bigbedwrite_test() -> Result<(), Box> { let mut outb = BigBedWrite::create_file(tempfile.path().to_string_lossy().to_string()); outb.autosql = { let (_, mut group) = vals_iter.next_chrom().unwrap().unwrap(); - let first = group.peek().unwrap().unwrap(); + let first = group.peek_val().unwrap(); Some(bigtools::bed::autosql::bed_autosql(&first.rest)) }; outb.options.compress = false; diff --git a/bigtools/tests/bigwigwrite.rs b/bigtools/tests/bigwigwrite.rs index 07c9e0c..56e2a81 100644 --- a/bigtools/tests/bigwigwrite.rs +++ b/bigtools/tests/bigwigwrite.rs @@ -8,7 +8,6 @@ use tempfile; use bigtools::bed::bedparser::BedParser; use bigtools::bedchromdata::BedParserStreamingIterator; -use bigtools::utils::chromvalues::ChromValues; use bigtools::{BigWigRead, BigWigWrite, Value}; use tokio::runtime; From 28c2425c092cb0fcfb55a79b71961483729041d8 Mon Sep 17 00:00:00 2001 From: Jack Huey <31162821+jackh726@users.noreply.github.com> Date: Sun, 21 Apr 2024 02:51:36 -0400 Subject: [PATCH 14/31] Remove BedParser and fold logic instead in process_to_bbi. 
Also fix chrom indexing --- bigtools/src/bbi/bedchromdata.rs | 265 ++++++++----- bigtools/src/bbi/bigwigwrite.rs | 6 +- bigtools/src/bed/bedparser.rs | 436 ++------------------- bigtools/src/bed/indexer.rs | 44 ++- bigtools/src/utils/cli/bedgraphtobigwig.rs | 26 +- bigtools/src/utils/cli/bedtobigbed.rs | 28 +- bigtools/tests/bigbedwrite.rs | 20 +- bigtools/tests/bigwigwrite.rs | 23 +- pybigtools/src/lib.rs | 7 +- 9 files changed, 276 insertions(+), 579 deletions(-) diff --git a/bigtools/src/bbi/bedchromdata.rs b/bigtools/src/bbi/bedchromdata.rs index 8a2f0d2..2e25beb 100644 --- a/bigtools/src/bbi/bedchromdata.rs +++ b/bigtools/src/bbi/bedchromdata.rs @@ -1,42 +1,90 @@ //! The types here (`BedParserStreamingIterator` and `BedParserParallelStreamingIterator`) -//! ultimately wrap around the a `BedParser` to interface with bigWig and bigBed writing. +//! process incoming bed-like data and process into bigWig and bigBed files. //! -//! `BedParserStreamingIterator` is a thin wrapper, which only really has extra checking -//! for out of order chromosomes. -//! -//! `BedParserParallelStreamingIterator` is a more complicated wrapper that will queue up -//! to 4 extra chromosomes to be processed concurrently. +//! `BedParserStreamingIterator` processes the data serially, checking for out +//! of order chromosomes. `BedParserParallelStreamingIterator`, on the other +//! hand, is more complicated wrapper and will queue up to 4 extra chromosomes +//! to be processed concurrently. 
use std::collections::VecDeque; use std::fs::File; -use std::io::BufReader; +use std::io::{BufReader, Read}; use std::path::PathBuf; use tokio::runtime::Runtime; use crate::bed::bedparser::{ - BedChromData, BedFileStream, BedParser, BedValueError, Parser, StateValue, StreamingBedValues, + parse_bed, parse_bedgraph, BedFileStream, BedInfallibleIteratorStream, BedIteratorStream, + BedValueError, Parser, StreamingBedValues, }; use crate::utils::file_view::FileView; use crate::utils::streaming_linereader::StreamingLineReader; -use crate::{ChromData, ChromProcess, ProcessChromError}; +use crate::{BedEntry, ChromData, ChromProcess, ProcessChromError, Value}; pub struct BedParserStreamingIterator { - bed_data: BedParser, + bed_data: S, allow_out_of_order_chroms: bool, - last_chrom: Option, } impl BedParserStreamingIterator { - pub fn new(bed_data: BedParser, allow_out_of_order_chroms: bool) -> Self { + pub fn new(bed_data: S, allow_out_of_order_chroms: bool) -> Self { BedParserStreamingIterator { bed_data, allow_out_of_order_chroms, - last_chrom: None, } } } +impl BedParserStreamingIterator>> { + pub fn from_bed_file(file: R, allow_out_of_order_chroms: bool) -> Self { + BedParserStreamingIterator::new( + BedFileStream { + bed: StreamingLineReader::new(BufReader::new(file)), + parse: parse_bed, + }, + allow_out_of_order_chroms, + ) + } +} + +impl BedParserStreamingIterator>> { + pub fn from_bedgraph_file(file: R, allow_out_of_order_chroms: bool) -> Self { + BedParserStreamingIterator::new( + BedFileStream { + bed: StreamingLineReader::new(BufReader::new(file)), + parse: parse_bedgraph, + }, + allow_out_of_order_chroms, + ) + } +} + +impl< + V: Clone, + E: Into, + C: Into + for<'a> PartialEq<&'a str>, + I: Iterator>, + > BedParserStreamingIterator> +{ + pub fn wrap_iter(iter: I, allow_out_of_order_chroms: bool) -> Self { + BedParserStreamingIterator::new( + BedIteratorStream { iter, curr: None }, + allow_out_of_order_chroms, + ) + } +} + +impl + for<'a> PartialEq<&'a str>, 
I: Iterator> + BedParserStreamingIterator> +{ + pub fn wrap_infallible_iter(iter: I, allow_out_of_order_chroms: bool) -> Self { + BedParserStreamingIterator::new( + BedInfallibleIteratorStream { iter, curr: None }, + allow_out_of_order_chroms, + ) + } +} + impl ChromData for BedParserStreamingIterator { type Value = S::Value; type Error = BedValueError; @@ -51,43 +99,79 @@ impl ChromData for BedParserStreamingIterator { start_processing: &mut StartProcessing, advance: &mut Advance, ) -> Result<(), ProcessChromError> { + let mut state: Option<(String, P, Option>)> = None; + loop { - match self.bed_data.next_chrom() { - Some(Ok((chrom, mut group))) => { - // First, if we don't want to allow out of order chroms, error here - let last = self.last_chrom.replace(chrom.clone()); - if let Some(c) = last { - // TODO: test this correctly fails - if !self.allow_out_of_order_chroms && c >= chrom { - return Err(ProcessChromError::SourceError(BedValueError::InvalidInput("Input bedGraph not sorted by chromosome. Sort with `sort -k1,1 -k2,2n`.".to_string()))); - } + let (curr_value, new_state) = match state { + Some((c, p, Some(v))) => (Some(v), Some((c, p))), + Some((c, p, None)) => (self.bed_data.next(), Some((c, p))), + None => (self.bed_data.next(), None), + }; + state = match (new_state, curr_value) { + // The next value is an error, but we never started + (None, Some(Err(e))) => return Err(ProcessChromError::SourceError(e)), + // There are no values at all + (None, None) => return Ok(()), + // There are no more values + (Some(state), None) => { + advance(state.1)?; + return Ok(()); + } + // The next value is an error and we have seen values before + (Some(state), Some(Err(e))) => { + // We *can* do anything since we've encountered an error. + // We'll go ahead and try to finish what we can, before we return. 
+ advance(state.1)?; + return Err(ProcessChromError::SourceError(e)); + } + // The next value is the first + (None, Some(Ok((chrom, val)))) => { + let chrom = chrom.to_string(); + let mut p = start_processing(chrom.clone())?; + let next_val = self.bed_data.next(); + let next_value = match &next_val { + Some(Ok(v)) if v.0 == chrom => Some(&v.1), + _ => None, + }; + runtime.block_on(p.do_process(val, next_value))?; + Some((chrom, p, next_val)) + } + // The next value is the same chromosome + (Some((prev_chrom, mut p)), Some(Ok((chrom, val)))) if chrom == &prev_chrom => { + let next_val = self.bed_data.next(); + let next_value = match &next_val { + Some(Ok(v)) if v.0 == prev_chrom => Some(&v.1), + _ => None, + }; + runtime.block_on(p.do_process(val, next_value))?; + Some((prev_chrom, p, next_val)) + } + // The next value is a different chromosome + (Some((prev_chrom, p)), Some(Ok((chrom, val)))) => { + // TODO: test this correctly fails + if !self.allow_out_of_order_chroms && prev_chrom.as_str() >= chrom { + return Err(ProcessChromError::SourceError(BedValueError::InvalidInput("Input bedGraph not sorted by chromosome. 
Sort with `sort -k1,1 -k2,2n`.".to_string()))); } + advance(p)?; - let mut p = start_processing(chrom)?; - - while let Some(current_val) = group.next() { - // If there is a source error, propogate that up - let current_val = current_val.map_err(ProcessChromError::SourceError)?; - let next_val = group.peek_val(); - - let read = p.do_process(current_val, next_val); - runtime.block_on(read)?; - } + let chrom = chrom.to_string(); + let mut p = start_processing(chrom.clone())?; + let next_val = self.bed_data.next(); + let next_value = match &next_val { + Some(Ok(v)) if v.0 == chrom => Some(&v.1), + _ => None, + }; - advance(p)?; + runtime.block_on(p.do_process(val, next_value))?; + Some((chrom, p, next_val)) } - Some(Err(e)) => return Err(ProcessChromError::SourceError(e)), - None => break, - } + }; } - - Ok(()) } } pub struct BedParserParallelStreamingIterator { allow_out_of_order_chroms: bool, - last_chrom: Option, chrom_indices: Vec<(u64, String)>, parse_fn: Parser, @@ -107,7 +191,6 @@ impl BedParserParallelStreamingIterator { BedParserParallelStreamingIterator { allow_out_of_order_chroms, - last_chrom: None, chrom_indices, parse_fn, @@ -135,55 +218,64 @@ impl ChromData for BedParserParallelStreamingIterator { loop { while remaining && queued_reads.len() < (4 + 1) { let (curr, next) = match self.chrom_indices.pop() { - Some(c) => (c, self.chrom_indices.get(0)), + Some(c) => (c, self.chrom_indices.last()), None => { remaining = false; break; } }; + next.map(|n| assert!(curr.1 != n.1)); + // TODO: test this correctly fails + if !self.allow_out_of_order_chroms && next.map(|n| curr.1 > n.1).unwrap_or(false) { + return Err(ProcessChromError::SourceError(BedValueError::InvalidInput( + "Input bedGraph not sorted by chromosome. Sort with `sort -k1,1 -k2,2n`." 
+ .to_string(), + ))); + } let file = match File::open(&self.path) { Ok(f) => f, Err(err) => return Err(ProcessChromError::SourceError(err.into())), }; let file = FileView::new(file, curr.0, next.map(|n| n.0).unwrap_or(u64::MAX))?; - let mut parser = BedParser::new(BedFileStream { + let mut stream = BedFileStream { bed: StreamingLineReader::new(BufReader::new(file)), parse: self.parse_fn, - }); + }; - match parser.next_chrom() { - Some(Ok((chrom, mut group))) => { - let last = self.last_chrom.replace(chrom.clone()); - if let Some(c) = last { - // TODO: test this correctly fails - if !self.allow_out_of_order_chroms && c >= chrom { - return Err(ProcessChromError::SourceError(BedValueError::InvalidInput("Input bedGraph not sorted by chromosome. Sort with `sort -k1,1 -k2,2n`.".to_string()))); - } + let mut p = start_processing(curr.1.clone())?; + let curr_chrom = curr.1.clone(); + let data: tokio::task::JoinHandle>> = + runtime.spawn(async move { + let mut next_val: Option> = None; + + loop { + let curr_value = match next_val.take() { + Some(v) => Some(v), + None => stream.next(), + }; + next_val = match curr_value { + // The next value is an error + Some(Err(e)) => return Err(ProcessChromError::SourceError(e)), + None => return Ok(p), + Some(Ok((chrom, _))) if chrom != curr_chrom => { + return Err(ProcessChromError::InvalidInput( + "File is not sorted.".to_string(), + )); + } + Some(Ok((_, val))) => { + let next_val = stream.next(); + let next_value = match &next_val { + Some(Ok(v)) if v.0 == curr_chrom => Some(&v.1), + _ => None, + }; + p.do_process(val, next_value).await?; + next_val + } + }; } - - let mut p = start_processing(chrom)?; - let data: tokio::task::JoinHandle< - Result>, - > = runtime.spawn(async move { - while let Some(current_val) = group.next() { - // If there is a source error, propogate that up - let current_val = - current_val.map_err(ProcessChromError::SourceError)?; - let next_val = group.peek_val(); - - let read = p.do_process::(current_val, 
next_val); - read.await?; - } - Ok(p) - }); - queued_reads.push_back(data); - } - Some(Err(e)) => return Err(ProcessChromError::SourceError(e)), - None => { - panic!("Unexpected end of file") - } - } + }); + queued_reads.push_back(data); } let Some(next_chrom) = queued_reads.pop_front() else { break; @@ -196,31 +288,6 @@ impl ChromData for BedParserParallelStreamingIterator { } } -impl BedChromData { - pub fn next(&mut self) -> Option> { - let state = self.load_state()?; - let ret = state.load_state_and_take_value(); - if matches!(state.state_value, StateValue::DiffChrom(..)) { - self.done = true; - } - ret - } - - pub fn peek_val(&mut self) -> Option<&S::Value> { - let state = self.load_state()?; - state.load_state(false); - let ret = match &state.state_value { - StateValue::Empty => None, - StateValue::Value(_, val) => Some(val), - StateValue::EmptyValue(_) => None, // Shouldn't occur - StateValue::DiffChrom(_, _) => None, // Only `Value` is peekable - StateValue::Error(_) => None, - StateValue::Done => None, - }; - ret - } -} - #[cfg(all(test, feature = "write"))] mod tests { use super::*; diff --git a/bigtools/src/bbi/bigwigwrite.rs b/bigtools/src/bbi/bigwigwrite.rs index 27719b3..ffaf718 100644 --- a/bigtools/src/bbi/bigwigwrite.rs +++ b/bigtools/src/bbi/bigwigwrite.rs @@ -9,16 +9,14 @@ Provides the interface for writing bigWig files. # use std::fs::File; # use bigtools::BigWigWrite; # use bigtools::bedchromdata::BedParserStreamingIterator; -# use bigtools::bed::bedparser::BedParser; # fn main() -> Result<(), Box> { # let mut dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")); # dir.push("resources/test"); # let mut bedgraph_in = dir.clone(); # bedgraph_in.push("single_chrom.bedGraph"); -// First, set up our input data. Here, we're using the `BedParserStreamingIterator` with a `BedParser`. +// First, set up our input data. Here, we're using the `BedParserStreamingIterator`. 
let bedgraph_file: File = File::open(bedgraph_in)?; -let vals_iter = BedParser::from_bedgraph_file(bedgraph_file); -let vals = BedParserStreamingIterator::new(vals_iter, false); +let vals = BedParserStreamingIterator::from_bedgraph_file(bedgraph_file, false); // Then, we need to know what the chromosome sizes are. This can be read in from a file, but here we // just construct a map for ease. diff --git a/bigtools/src/bed/bedparser.rs b/bigtools/src/bed/bedparser.rs index ef251f8..775603f 100644 --- a/bigtools/src/bed/bedparser.rs +++ b/bigtools/src/bed/bedparser.rs @@ -10,9 +10,7 @@ //! from one chromosome to another. The is important because bigwig/bigbed writing is "chunked" by chromosome. use std::io::{self, BufRead, BufReader, Read}; -use std::sync::Arc; -use crossbeam_utils::atomic::AtomicCell; use thiserror::Error; use crate::bbi::{BedEntry, Value}; @@ -104,6 +102,24 @@ pub struct BedFileStream { pub parse: Parser, } +impl BedFileStream> { + pub fn from_bed_file(file: R) -> BedFileStream> { + BedFileStream { + bed: StreamingLineReader::new(BufReader::new(file)), + parse: parse_bed, + } + } +} + +impl BedFileStream> { + pub fn from_bedgraph_file(file: R) -> BedFileStream> { + BedFileStream { + bed: StreamingLineReader::new(BufReader::new(file)), + parse: parse_bedgraph, + } + } +} + impl StreamingBedValues for BedFileStream { type Value = V; @@ -122,8 +138,8 @@ impl StreamingBedValues for BedFileStream { // Wraps a bed-like Iterator pub struct BedIteratorStream { - iter: I, - curr: Option<(String, V)>, + pub(crate) iter: I, + pub(crate) curr: Option<(String, V)>, } impl< @@ -154,8 +170,8 @@ impl< // Wraps a bed-like Iterator pub struct BedInfallibleIteratorStream { - iter: I, - curr: Option<(String, V)>, + pub(crate) iter: I, + pub(crate) curr: Option<(String, V)>, } impl + for<'a> PartialEq<&'a str>, I: Iterator> @@ -178,411 +194,3 @@ impl + for<'a> PartialEq<&'a str>, I: Iterator { - state: Arc>>>, -} - -/// Defines the internal states of bed parsing 
-pub(crate) enum StateValue { - // No value has been loaded yet - Empty, - // A value has been loaded without error - // Contains the current chromosome and the value. - Value(String, V), - // A previously loaded value was taken. - // Contains the current chromosome. - EmptyValue(String), - // A new chromsome has been loaded - DiffChrom(String, V), - // An error has been seen - Error(BedValueError), - // We are done, either because we have run out of values or because of an error - Done, -} - -impl std::fmt::Debug for StateValue { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - Self::Empty => write!(f, "Empty"), - Self::Value(arg0, _) => f.debug_tuple("Value").field(arg0).finish(), - Self::EmptyValue(arg0) => f.debug_tuple("EmptyValue").field(arg0).finish(), - Self::DiffChrom(arg0, _) => f.debug_tuple("DiffChrom").field(arg0).finish(), - Self::Error(arg0) => f.debug_tuple("Error").field(arg0).finish(), - Self::Done => write!(f, "Done"), - } - } -} - -impl StateValue { - fn take_error(&mut self) -> Option { - let v = std::mem::replace(self, StateValue::Done); - let ret; - (*self, ret) = match v { - StateValue::Error(e) => (StateValue::Done, Some(e)), - s => (s, None), - }; - ret - } - - fn active_chrom(&self) -> Option<&String> { - match self { - StateValue::Empty => None, - StateValue::Value(c, _) => Some(c), - StateValue::EmptyValue(c) => Some(c), - StateValue::DiffChrom(c, _) => Some(c), - StateValue::Error(_) => None, - StateValue::Done => None, - } - } -} - -#[derive(Debug)] -pub(crate) struct BedParserState { - stream: S, - pub(crate) state_value: StateValue, -} - -impl BedParser { - pub fn new(stream: S) -> Self { - let state = BedParserState { - stream, - state_value: StateValue::Empty, - }; - BedParser { - state: Arc::new(AtomicCell::new(Some(state))), - } - } -} - -impl BedParser>> { - pub fn from_bed_file(file: R) -> Self { - BedParser::new(BedFileStream { - bed: StreamingLineReader::new(BufReader::new(file)), - 
parse: parse_bed, - }) - } -} - -impl BedParser>> { - pub fn from_bedgraph_file(file: R) -> Self { - BedParser::new(BedFileStream { - bed: StreamingLineReader::new(BufReader::new(file)), - parse: parse_bedgraph, - }) - } -} - -impl< - V: Clone, - E: Into, - C: Into + for<'a> PartialEq<&'a str>, - I: Iterator>, - > BedParser> -{ - pub fn wrap_iter(iter: I) -> Self { - BedParser::new(BedIteratorStream { iter, curr: None }) - } -} - -impl + for<'a> PartialEq<&'a str>, I: Iterator> - BedParser> -{ - pub fn wrap_infallible_iter(iter: I) -> Self { - BedParser::new(BedInfallibleIteratorStream { iter, curr: None }) - } -} - -impl BedParser { - // This is *valid* to call multiple times for the same chromosome (assuming the - // `BedChromData` has been dropped), since calling this function doesn't - // actually advance the state (it will only set `next_val` if it currently is none). - pub fn next_chrom(&mut self) -> Option), BedValueError>> { - let mut state = self.state.swap(None).expect("Invalid usage. 
This iterator does not buffer and all values should be exhausted for a chrom before next() is called."); - state.load_state(true); - let error = state.state_value.take_error(); - let chrom = state.state_value.active_chrom().cloned(); - self.state.swap(Some(state)); - - if let Some(e) = error { - return Some(Err(e)); - } - - match chrom { - Some(chrom) => { - let group = BedChromData { - state: self.state.clone(), - curr_state: None, - done: false, - }; - Some(Ok((chrom.to_owned(), group))) - } - None => None, - } - } -} - -impl BedParserState { - pub(crate) fn load_state(&mut self, switch_chrom: bool) { - let state_value = std::mem::replace(&mut self.state_value, StateValue::Empty); - self.state_value = match state_value { - StateValue::Empty => match self.stream.next() { - None => StateValue::Done, - Some(Ok((chrom, val))) => StateValue::Value(chrom.to_owned(), val), - Some(Err(err)) => StateValue::Error(err), - }, - StateValue::Value(c, v) => StateValue::Value(c, v), - StateValue::EmptyValue(prev_chrom) => match self.stream.next() { - None => StateValue::Done, - Some(Ok((chrom, val))) if switch_chrom || prev_chrom == chrom => { - StateValue::Value(prev_chrom, val) - } - Some(Ok((chrom, val))) => StateValue::DiffChrom(chrom.to_owned(), val), - Some(Err(err)) => StateValue::Error(err), - }, - StateValue::DiffChrom(c, v) if switch_chrom => StateValue::Value(c, v), - StateValue::DiffChrom(c, v) => StateValue::DiffChrom(c, v), - StateValue::Error(e) => StateValue::Error(e), - StateValue::Done => StateValue::Done, - }; - // For sanity, if we're switching chromosomes then we should never have an empty value or say we're in a "different" chromosome - debug_assert!( - !(switch_chrom - && matches!( - &self.state_value, - StateValue::Empty | StateValue::EmptyValue(..) | StateValue::DiffChrom(..) 
- )), - ); - } - - pub(crate) fn load_state_and_take_value(&mut self) -> Option> { - let state_value = std::mem::replace(&mut self.state_value, StateValue::Empty); - let ret; - (self.state_value, ret) = match state_value { - StateValue::Empty => match self.stream.next() { - None => (StateValue::Done, None), - Some(Ok((chrom, val))) => (StateValue::EmptyValue(chrom.to_owned()), Some(Ok(val))), - Some(Err(err)) => (StateValue::Done, Some(Err(err))), - }, - StateValue::Value(c, v) => (StateValue::EmptyValue(c), Some(Ok(v))), - StateValue::EmptyValue(prev_chrom) => match self.stream.next() { - None => (StateValue::Done, None), - Some(Ok((chrom, val))) if prev_chrom == chrom => { - (StateValue::EmptyValue(prev_chrom), Some(Ok(val))) - } - Some(Ok((chrom, val))) => (StateValue::DiffChrom(chrom.to_owned(), val), None), - Some(Err(err)) => (StateValue::Done, Some(Err(err))), - }, - StateValue::DiffChrom(c, v) => (StateValue::DiffChrom(c, v), None), - StateValue::Error(e) => (StateValue::Done, Some(Err(e))), - StateValue::Done => (StateValue::Done, None), - }; - // For sanity, we shouldn't have any error or value (for the current chromosome) stored - debug_assert!(matches!( - &self.state_value, - StateValue::Done | StateValue::EmptyValue(..) | StateValue::DiffChrom(..) - )); - ret - } -} - -// The separation here between the "current" state and the shared state comes -// from the observation that once we *start* on a chromosome, we can't move on -// to the next until we've exhausted the current. In this *particular* -// implementation, we don't allow parallel iteration of chromsomes. So, the -// state is either needed *here* or in the main struct. 
-pub struct BedChromData { - state: Arc>>>, - curr_state: Option>, - pub(crate) done: bool, -} - -impl BedChromData { - pub(crate) fn load_state(&mut self) -> Option<&mut BedParserState> { - if self.done { - return None; - } - if self.curr_state.is_none() { - let opt_state = self.state.swap(None); - if opt_state.is_none() { - panic!("Invalid usage. This iterator does not buffer and all values should be exhausted for a chrom before next() is called."); - } - self.curr_state = opt_state; - } - Some(self.curr_state.as_mut().unwrap()) - } -} - -impl Drop for BedChromData { - fn drop(&mut self) { - if let Some(state) = self.curr_state.take() { - self.state.swap(Some(state)); - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - use std::fs::File; - use std::path::PathBuf; - - #[test] - fn test_bed_works() { - let mut dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - dir.push("resources/test"); - dir.push("small.bed"); - let f = File::open(dir).unwrap(); - let mut bgp = BedParser::from_bed_file(f); - macro_rules! 
check_value { - ($c:ident $chrom:literal) => { - assert_eq!($c, $chrom); - }; - (peek next $group:expr, $start:literal $end:literal $rest:expr) => { - check_value!(peek $group, $start $end $rest); - check_value!(next $group, $start $end $rest); - }; - (peek $group:expr, $start:literal $end:literal $rest:expr) => { - assert_eq!( - &BedEntry { - start: $start, - end: $end, - rest: $rest.to_string() - }, - $group.peek_val().unwrap() - ); - }; - (next $group:expr, $start:literal $end:literal $rest:expr) => { - assert_eq!( - BedEntry { - start: $start, - end: $end, - rest: $rest.to_string() - }, - $group.next().unwrap().unwrap() - ); - }; - } - { - let (chrom, mut group) = bgp.next_chrom().unwrap().unwrap(); - check_value!(chrom "chr17"); - check_value!(peek group, 1 100 "test1\t0"); - } - { - let (chrom, mut group) = bgp.next_chrom().unwrap().unwrap(); - check_value!(chrom "chr17"); - check_value!(peek group, 1 100 "test1\t0"); - } - { - let (chrom, mut group) = bgp.next_chrom().unwrap().unwrap(); - check_value!(chrom "chr17"); - check_value!(peek next group, 1 100 "test1\t0"); - check_value!(peek next group, 101 200 "test2\t0"); - check_value!(peek next group, 201 300 "test3\t0"); - assert!(group.peek_val().is_none()); - - assert!(group.next().is_none()); - assert!(group.peek_val().is_none()); - } - { - let (chrom, mut group) = bgp.next_chrom().unwrap().unwrap(); - check_value!(chrom "chr18"); - check_value!(peek next group, 1 100 "test4\t0"); - check_value!(peek next group, 101 200 "test5\t0"); - assert!(group.peek_val().is_none()); - - assert!(group.next().is_none()); - assert!(group.peek_val().is_none()); - } - { - let (chrom, mut group) = bgp.next_chrom().unwrap().unwrap(); - check_value!(chrom "chr19"); - check_value!(peek next group, 1 100 "test6\t0"); - assert!(group.peek_val().is_none()); - - assert!(group.next().is_none()); - assert!(group.peek_val().is_none()); - } - assert!(matches!(bgp.next_chrom(), None)); - } - - #[test] - fn test_bedgraph_works() { - 
let mut dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - dir.push("resources/test"); - dir.push("small.bedGraph"); - let f = File::open(dir).unwrap(); - let mut bgp = BedParser::from_bedgraph_file(f); - macro_rules! check_value { - ($c:ident $chrom:literal) => { - assert_eq!($c, $chrom); - }; - (peek next $group:expr, $start:literal $end:literal) => { - check_value!(peek $group, $start $end); - check_value!(next $group, $start $end); - }; - (peek $group:expr, $start:literal $end:literal) => { - assert_eq!( - &Value { - start: $start, - end: $end, - value: 0.5, - }, - $group.peek_val().unwrap() - ); - }; - (next $group:expr, $start:literal $end:literal) => { - assert_eq!( - Value { - start: $start, - end: $end, - value: 0.5, - }, - $group.next().unwrap().unwrap() - ); - }; - } - { - let (chrom, mut group) = bgp.next_chrom().unwrap().unwrap(); - check_value!(chrom "chr17"); - check_value!(peek group, 1 100); - } - { - let (chrom, mut group) = bgp.next_chrom().unwrap().unwrap(); - check_value!(chrom "chr17"); - check_value!(peek group, 1 100); - } - { - let (chrom, mut group) = bgp.next_chrom().unwrap().unwrap(); - check_value!(chrom "chr17"); - check_value!(peek next group, 1 100); - check_value!(peek next group, 101 200); - check_value!(peek next group, 201 300); - assert!(group.peek_val().is_none()); - - assert!(group.next().is_none()); - assert!(group.peek_val().is_none()); - } - { - let (chrom, mut group) = bgp.next_chrom().unwrap().unwrap(); - check_value!(chrom "chr18"); - check_value!(peek next group, 1 100); - check_value!(peek next group, 101 200); - assert!(group.peek_val().is_none()); - - assert!(group.next().is_none()); - assert!(group.peek_val().is_none()); - } - { - let (chrom, mut group) = bgp.next_chrom().unwrap().unwrap(); - check_value!(chrom "chr19"); - check_value!(peek next group, 1 100); - assert!(group.peek_val().is_none()); - - assert!(group.next().is_none()); - assert!(group.peek_val().is_none()); - } - assert!(matches!(bgp.next_chrom(), 
None)); - } -} diff --git a/bigtools/src/bed/indexer.rs b/bigtools/src/bed/indexer.rs index 5b33aa1..47f83d8 100644 --- a/bigtools/src/bed/indexer.rs +++ b/bigtools/src/bed/indexer.rs @@ -99,14 +99,30 @@ pub fn index_chroms(file: File) -> io::Result>> { let curr = chroms.insert_after(prev, (tell, chrom)).unwrap(); - if chroms[curr].1 != chroms[prev].1 && tell < next_tell { + let left = chroms[curr].1 != chroms[prev].1 && tell < next_tell; + let right = next + .map(|next| chroms[curr].1 != chroms[next].1 && tell < chroms[next].0) + .unwrap_or(true); + + if left { do_index(file_size, file, chroms, line, prev, Some(curr), limit - 1)?; } - if next.map(|next| tell < chroms[next].0).unwrap_or(true) { + if right { do_index(file_size, file, chroms, line, curr, next, limit - 1)?; } + if chroms[curr].1 != chroms[prev].1 && tell == next_tell { + file.seek(SeekFrom::Start(chroms[prev].0))?; + line.clear(); + file.read_line(line)?; + line.clear(); + let tell = file.tell()?; + file.read_line(line)?; + let chrom = parse_line(&*line)? + .expect("Bad logic. 
Must at least find last entry for chromosome."); + chroms.insert_after(prev, (tell, chrom)).unwrap(); + } Ok(()) } @@ -132,6 +148,30 @@ pub fn index_chroms(file: File) -> io::Result>> { Ok(Some(chroms)) } +#[allow(unused)] +fn linear_index(reader: &mut BufReader) -> io::Result> { + reader.seek(SeekFrom::Start(0))?; + let mut chroms: Vec<(u64, String)> = vec![]; + loop { + let mut line = String::new(); + let tell = reader.seek(SeekFrom::Current(0))?; + let read = reader.read_line(&mut line)?; + if read == 0 { + break; + } + + let chrom = line + .split_ascii_whitespace() + .next() + .ok_or(io::Error::from(io::ErrorKind::InvalidData))?; + if chroms.last().map(|c| &c.1 != chrom).unwrap_or(true) { + chroms.push((tell, chrom.to_string())); + } + line.clear(); + } + Ok(chroms) +} + #[cfg(test)] mod tests { use super::*; diff --git a/bigtools/src/utils/cli/bedgraphtobigwig.rs b/bigtools/src/utils/cli/bedgraphtobigwig.rs index 867e3ec..0eef7a3 100644 --- a/bigtools/src/utils/cli/bedgraphtobigwig.rs +++ b/bigtools/src/utils/cli/bedgraphtobigwig.rs @@ -4,7 +4,7 @@ use std::{collections::HashMap, error::Error, fs::File, path::PathBuf}; use clap::Parser; use tokio::runtime; -use crate::bed::bedparser::{parse_bedgraph, BedParser}; +use crate::bed::bedparser::parse_bedgraph; use crate::bed::indexer::index_chroms; use crate::bedchromdata::{BedParserParallelStreamingIterator, BedParserStreamingIterator}; use crate::{BigWigWrite, InputSortType}; @@ -99,10 +99,8 @@ pub fn bedgraphtobigwig(args: BedGraphToBigWigArgs) -> Result<(), Box let allow_out_of_order_chroms = !matches!(outb.options.input_sort_type, InputSortType::ALL); if bedgraphpath == "-" || bedgraphpath == "stdin" { let stdin = std::io::stdin().lock(); - let vals_iter = BedParser::from_bedgraph_file(stdin); - - let chsi = BedParserStreamingIterator::new(vals_iter, allow_out_of_order_chroms); - outb.write(chrom_map, chsi, runtime)?; + let vals = BedParserStreamingIterator::from_bedgraph_file(stdin, 
allow_out_of_order_chroms); + outb.write(chrom_map, vals, runtime)?; } else { let infile = File::open(&bedgraphpath)?; let (parallel, parallel_required) = match (nthreads, args.parallel.as_ref()) { @@ -161,19 +159,19 @@ pub fn bedgraphtobigwig(args: BedGraphToBigWigArgs) -> Result<(), Box } else { let infile = File::open(&bedgraphpath)?; if args.single_pass { - let vals_iter = BedParser::from_bedgraph_file(infile); - - let chsi = BedParserStreamingIterator::new(vals_iter, allow_out_of_order_chroms); - outb.write(chrom_map, chsi, runtime)?; + let vals = BedParserStreamingIterator::from_bedgraph_file( + infile, + allow_out_of_order_chroms, + ); + outb.write(chrom_map, vals, runtime)?; } else { outb.write_multipass( || { let infile = File::open(&bedgraphpath)?; - let vals_iter = BedParser::from_bedgraph_file(infile); - let chsi = - BedParserStreamingIterator::new(vals_iter, allow_out_of_order_chroms); - - Ok(chsi) + Ok(BedParserStreamingIterator::from_bedgraph_file( + infile, + allow_out_of_order_chroms, + )) }, chrom_map, runtime, diff --git a/bigtools/src/utils/cli/bedtobigbed.rs b/bigtools/src/utils/cli/bedtobigbed.rs index 28fa4fd..c238047 100644 --- a/bigtools/src/utils/cli/bedtobigbed.rs +++ b/bigtools/src/utils/cli/bedtobigbed.rs @@ -7,12 +7,10 @@ use std::path::PathBuf; use clap::Parser; use tokio::runtime; -use crate::bed::bedparser::parse_bed; +use crate::bed::bedparser::{parse_bed, BedFileStream, StreamingBedValues}; use crate::bed::indexer::index_chroms; use crate::bedchromdata::BedParserParallelStreamingIterator; -use crate::{ - bed::bedparser::BedParser, bedchromdata::BedParserStreamingIterator, BigBedWrite, InputSortType, -}; +use crate::{bedchromdata::BedParserStreamingIterator, BigBedWrite, InputSortType}; use super::BBIWriteArgs; @@ -108,10 +106,8 @@ pub fn bedtobigbed(args: BedToBigBedArgs) -> Result<(), Box> { let autosql = match args.autosql.as_ref() { None => { let infile = File::open(&bedpath)?; - let mut vals_iter = 
BedParser::from_bed_file(infile); - let (_, mut group) = vals_iter.next_chrom().unwrap().unwrap(); - let first = group.peek_val().unwrap(); - crate::bed::autosql::bed_autosql(&first.rest) + let mut vals_iter = BedFileStream::from_bed_file(infile); + crate::bed::autosql::bed_autosql(&vals_iter.next().unwrap().unwrap().1.rest) } Some(file) => std::fs::read_to_string(file)?, }; @@ -120,9 +116,7 @@ pub fn bedtobigbed(args: BedToBigBedArgs) -> Result<(), Box> { let allow_out_of_order_chroms = !matches!(outb.options.input_sort_type, InputSortType::ALL); if bedpath == "-" || bedpath == "stdin" { let stdin = std::io::stdin().lock(); - let vals_iter = BedParser::from_bed_file(stdin); - - let chsi = BedParserStreamingIterator::new(vals_iter, allow_out_of_order_chroms); + let chsi = BedParserStreamingIterator::from_bed_file(stdin, allow_out_of_order_chroms); outb.write(chrom_map, chsi, runtime)?; } else { let infile = File::open(&bedpath)?; @@ -182,17 +176,17 @@ pub fn bedtobigbed(args: BedToBigBedArgs) -> Result<(), Box> { } else { let infile = File::open(&bedpath)?; if args.single_pass { - let vals_iter = BedParser::from_bed_file(infile); - - let chsi = BedParserStreamingIterator::new(vals_iter, allow_out_of_order_chroms); + let chsi = + BedParserStreamingIterator::from_bed_file(infile, allow_out_of_order_chroms); outb.write(chrom_map, chsi, runtime)?; } else { outb.write_multipass( || { let infile = File::open(&bedpath)?; - let vals_iter = BedParser::from_bed_file(infile); - let chsi = - BedParserStreamingIterator::new(vals_iter, allow_out_of_order_chroms); + let chsi = BedParserStreamingIterator::from_bed_file( + infile, + allow_out_of_order_chroms, + ); Ok(chsi) }, diff --git a/bigtools/tests/bigbedwrite.rs b/bigtools/tests/bigbedwrite.rs index c3164fb..f2e89b4 100644 --- a/bigtools/tests/bigbedwrite.rs +++ b/bigtools/tests/bigbedwrite.rs @@ -1,5 +1,6 @@ use std::error::Error; +use bigtools::bed::bedparser::{BedFileStream, StreamingBedValues}; use 
bigtools::bedchromdata::BedParserStreamingIterator; use tokio::runtime; @@ -11,7 +12,6 @@ fn bigbedwrite_test() -> Result<(), Box> { use tempfile; - use bigtools::bed::bedparser::BedParser; use bigtools::{BigBedRead, BigBedWrite}; let mut dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")); @@ -22,9 +22,8 @@ fn bigbedwrite_test() -> Result<(), Box> { let first = { let infile = File::open(bed.clone())?; - let mut vals_iter = BedParser::from_bed_file(infile); - let (_, mut group) = vals_iter.next_chrom().unwrap().unwrap(); - group.next().unwrap().unwrap() + let mut vals_iter = BedFileStream::from_bed_file(infile); + vals_iter.next().unwrap().unwrap().1 }; let runtime = runtime::Builder::new_multi_thread() @@ -32,14 +31,14 @@ fn bigbedwrite_test() -> Result<(), Box> { .build() .expect("Unable to create runtime."); - let infile = File::open(bed)?; let tempfile = tempfile::NamedTempFile::new()?; - let mut vals_iter = BedParser::from_bed_file(infile); let mut outb = BigBedWrite::create_file(tempfile.path().to_string_lossy().to_string()); outb.autosql = { - let (_, mut group) = vals_iter.next_chrom().unwrap().unwrap(); - let first = group.peek_val().unwrap(); - Some(bigtools::bed::autosql::bed_autosql(&first.rest)) + let infile = File::open(&bed)?; + let mut vals_iter = BedFileStream::from_bed_file(infile); + Some(bigtools::bed::autosql::bed_autosql( + &vals_iter.next().unwrap().unwrap().1.rest, + )) }; outb.options.compress = false; @@ -48,7 +47,8 @@ fn bigbedwrite_test() -> Result<(), Box> { chrom_map.insert("chr18".to_string(), 80373285); chrom_map.insert("chr19".to_string(), 58617616); - let chsi = BedParserStreamingIterator::new(vals_iter, false); + let infile = File::open(bed)?; + let chsi = BedParserStreamingIterator::from_bed_file(infile, false); outb.write(chrom_map, chsi, runtime).unwrap(); let mut bwread = BigBedRead::open_file(&tempfile.path().to_string_lossy()).unwrap(); diff --git a/bigtools/tests/bigwigwrite.rs b/bigtools/tests/bigwigwrite.rs index 
56e2a81..3a7deeb 100644 --- a/bigtools/tests/bigwigwrite.rs +++ b/bigtools/tests/bigwigwrite.rs @@ -6,7 +6,7 @@ use std::path::PathBuf; use tempfile; -use bigtools::bed::bedparser::BedParser; +use bigtools::bed::bedparser::{BedFileStream, StreamingBedValues}; use bigtools::bedchromdata::BedParserStreamingIterator; use bigtools::{BigWigRead, BigWigWrite, Value}; use tokio::runtime; @@ -21,9 +21,8 @@ fn test() -> Result<(), Box> { let first = { let infile = File::open(single_chrom_bedgraph.clone())?; - let mut vals_iter = BedParser::from_bedgraph_file(infile); - let (_, mut group) = vals_iter.next_chrom().unwrap().unwrap(); - group.next().unwrap().unwrap() + let mut vals_iter = BedFileStream::from_bedgraph_file(infile); + vals_iter.next().unwrap().unwrap().1 }; let runtime = runtime::Builder::new_multi_thread() @@ -33,13 +32,12 @@ fn test() -> Result<(), Box> { let infile = File::open(single_chrom_bedgraph)?; let tempfile = tempfile::NamedTempFile::new()?; - let vals_iter = BedParser::from_bedgraph_file(infile); let outb = BigWigWrite::create_file(tempfile.path().to_string_lossy().to_string()); let mut chrom_map = HashMap::new(); chrom_map.insert("chr17".to_string(), 83257441); - let chsi = BedParserStreamingIterator::new(vals_iter, false); + let chsi = BedParserStreamingIterator::from_bedgraph_file(infile, false); outb.write(chrom_map, chsi, runtime).unwrap(); let mut bwread = BigWigRead::open_file(&tempfile.path().to_string_lossy()).unwrap(); @@ -68,9 +66,8 @@ fn test_multi_pass() -> Result<(), Box> { let first = { let infile = File::open(single_chrom_bedgraph.clone())?; - let mut vals_iter = BedParser::from_bedgraph_file(infile); - let (_, mut group) = vals_iter.next_chrom().unwrap().unwrap(); - group.next().unwrap().unwrap() + let mut vals_iter = BedFileStream::from_bedgraph_file(infile); + vals_iter.next().unwrap().unwrap().1 }; let runtime = runtime::Builder::new_multi_thread() @@ -88,8 +85,7 @@ fn test_multi_pass() -> Result<(), Box> { outb.write_multipass( || 
{ let infile = File::open(single_chrom_bedgraph.clone())?; - let vals_iter = BedParser::from_bedgraph_file(infile); - let chsi = BedParserStreamingIterator::new(vals_iter, false); + let chsi = BedParserStreamingIterator::from_bedgraph_file(infile, false); Ok(chsi) }, chrom_map, @@ -128,7 +124,6 @@ fn test_multi_chrom() -> io::Result<()> { let infile = File::open(multi_chrom_bedgraph)?; let tempfile = tempfile::NamedTempFile::new()?; - let vals_iter = BedParser::from_bedgraph_file(infile); let outb = BigWigWrite::create_file(tempfile.path().to_string_lossy().to_string()); let mut chrom_map = HashMap::new(); @@ -139,7 +134,7 @@ fn test_multi_chrom() -> io::Result<()> { chrom_map.insert("chr5".to_string(), 181538259); chrom_map.insert("chr6".to_string(), 170805979); - let chsi = BedParserStreamingIterator::new(vals_iter, false); + let chsi = BedParserStreamingIterator::from_bedgraph_file(infile, false); outb.write(chrom_map, chsi, runtime).unwrap(); let mut bwread = BigWigRead::open_file(&tempfile.path().to_string_lossy()).unwrap(); @@ -181,7 +176,7 @@ fn test_iter() { ) }); - let vals_iter = BedParserStreamingIterator::new(BedParser::wrap_infallible_iter(iter), true); + let vals_iter = BedParserStreamingIterator::wrap_infallible_iter(iter, true); let chrom_map = HashMap::from([("chrY".to_string(), 57_227_415)]); diff --git a/pybigtools/src/lib.rs b/pybigtools/src/lib.rs index ac7ade8..51bc1c4 100644 --- a/pybigtools/src/lib.rs +++ b/pybigtools/src/lib.rs @@ -7,7 +7,6 @@ use std::ops::IndexMut; use std::path::Path; use bigtools::bed::autosql::parse::parse_autosql; -use bigtools::bed::bedparser::BedParser; use bigtools::bedchromdata::BedParserStreamingIterator; #[cfg(feature = "remote")] use bigtools::utils::file::remote_file::RemoteFile; @@ -2027,8 +2026,7 @@ impl BigWigWrite { Err(e) => Err(io::Error::new(io::ErrorKind::Other, format!("{}", e.0))), Ok(v) => Ok(v), }); - let vals_iter = BedParser::wrap_iter(vals_iter_raw); - let chsi = 
BedParserStreamingIterator::new(vals_iter, true); + let chsi = BedParserStreamingIterator::wrap_iter(vals_iter_raw, true); match bigwig.write(chrom_map, chsi, runtime) { Err(e) => println!("{}", e), Ok(_) => {} @@ -2180,8 +2178,7 @@ impl BigBedWrite { Err(e) => Err(io::Error::new(io::ErrorKind::Other, format!("{}", e.0))), Ok(v) => Ok(v), }); - let vals_iter = BedParser::wrap_iter(vals_iter_raw); - let chsi = BedParserStreamingIterator::new(vals_iter, true); + let chsi = BedParserStreamingIterator::wrap_iter(vals_iter_raw, true); match bigbed.write(chrom_map, chsi, runtime) { Err(e) => { println!("{}", e) From 3f3e9bd1d52e6c6f39ca856bbc7f7075cb8faeb9 Mon Sep 17 00:00:00 2001 From: Jack Huey <31162821+jackh726@users.noreply.github.com> Date: Sun, 21 Apr 2024 03:17:33 -0400 Subject: [PATCH 15/31] A couple minor things --- bigtools/src/bbi/bbiwrite.rs | 4 ++-- bigtools/src/bbi/bigwigwrite.rs | 26 ++++++++++++++++---------- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/bigtools/src/bbi/bbiwrite.rs b/bigtools/src/bbi/bbiwrite.rs index 548909f..267a42b 100644 --- a/bigtools/src/bbi/bbiwrite.rs +++ b/bigtools/src/bbi/bbiwrite.rs @@ -589,7 +589,7 @@ type DataWithoutzooms = ( futures::future::RemoteHandle>>, ); -pub(crate) async fn write_chroms_with_zooms( +async fn write_chroms_with_zooms( mut file: BufWriter, mut zooms_map: BTreeMap, mut receiver: futures_mpsc::UnboundedReceiver>, @@ -695,7 +695,7 @@ pub struct InternalProcessData( pub(crate) u32, ); -pub mod process_internal { +pub(crate) mod process_internal { pub trait ChromProcessCreate { type I; type Out; diff --git a/bigtools/src/bbi/bigwigwrite.rs b/bigtools/src/bbi/bigwigwrite.rs index ffaf718..58ecbab 100644 --- a/bigtools/src/bbi/bigwigwrite.rs +++ b/bigtools/src/bbi/bigwigwrite.rs @@ -245,7 +245,7 @@ impl BigWigWrite { Ok(()) } - async fn process_val( + async fn process_val( current_val: Value, next_val: Option<&Value>, chrom_length: u32, @@ -256,20 +256,20 @@ impl BigWigWrite { runtime: 
&Handle, ftx: &mut ChromProcessingInputSectionChannel, chrom_id: u32, - ) -> Result<(), ProcessChromError> { + ) -> Result<(), BigWigInvalidInput> { // Check a few preconditions: // - The current end is greater than or equal to the start // - The current end is at most the chromosome length // - If there is a next value, then it does not overlap value // TODO: test these correctly fails if current_val.start > current_val.end { - return Err(ProcessChromError::InvalidInput(format!( + return Err(BigWigInvalidInput(format!( "Invalid bed graph: {} > {}", current_val.start, current_val.end ))); } if current_val.end > chrom_length { - return Err(ProcessChromError::InvalidInput(format!( + return Err(BigWigInvalidInput(format!( "Invalid bed graph: `{}` is greater than the chromosome ({}) length ({})", current_val.end, chrom, chrom_length ))); @@ -278,7 +278,7 @@ impl BigWigWrite { None => {} Some(next_val) => { if current_val.end > next_val.start { - return Err(ProcessChromError::InvalidInput(format!( + return Err(BigWigInvalidInput(format!( "Invalid bed graph: overlapping values on chromosome {} at {}-{} and {}-{}", chrom, current_val.start, current_val.end, next_val.start, next_val.end, ))); @@ -311,14 +311,14 @@ impl BigWigWrite { Ok(()) } - async fn process_val_zoom( + async fn process_val_zoom( zoom_items: &mut Vec, options: BBIWriteOptions, current_val: Value, next_val: Option<&Value>, runtime: &Handle, chrom_id: u32, - ) -> Result<(), ProcessChromError> { + ) { // Then, add the item to the zoom item queues. This is a bit complicated. 
for zoom_item in zoom_items.iter_mut() { debug_assert_ne!(zoom_item.records.len(), options.items_per_slot as usize); @@ -395,8 +395,14 @@ impl BigWigWrite { } debug_assert_ne!(zoom_item.records.len(), options.items_per_slot as usize); } + } +} - Ok(()) +struct BigWigInvalidInput(String); + +impl From for ProcessChromError { + fn from(value: BigWigInvalidInput) -> Self { + ProcessChromError::InvalidInput(value.0) } } @@ -518,7 +524,7 @@ impl ChromProcess for BigWigFullProcess { &runtime, chrom_id, ) - .await?; + .await; Ok(()) } @@ -721,7 +727,7 @@ impl ChromProcess for BigWigZoomsProcess { &runtime, *chrom_id, ) - .await?; + .await; Ok(()) } From 03bd96cf53905cee73c1cc9f0cb5dfa44139f546 Mon Sep 17 00:00:00 2001 From: Jack Huey <31162821+jackh726@users.noreply.github.com> Date: Sun, 21 Apr 2024 11:14:37 -0400 Subject: [PATCH 16/31] Make do_read in write_vals a fn --- bigtools/src/bbi/bbiwrite.rs | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/bigtools/src/bbi/bbiwrite.rs b/bigtools/src/bbi/bbiwrite.rs index 267a42b..800f177 100644 --- a/bigtools/src/bbi/bbiwrite.rs +++ b/bigtools/src/bbi/bbiwrite.rs @@ -751,12 +751,31 @@ pub(crate) fn write_vals< let mut chrom_ids = IdMap::default(); let mut summary: Option = None; - let (send, recv) = futures_mpsc::unbounded(); + let (mut send, recv) = futures_mpsc::unbounded(); let write_fut = write_chroms_with_zooms(file, zooms_map, recv); let (write_fut, write_fut_handle) = write_fut.remote_handle(); runtime.spawn(write_fut); - let setup_chrom = || { + fn setup_chrom( + send: &mut futures_mpsc::UnboundedSender<( + crossbeam_channel::Receiver
, + TempFileBuffer>, + futures::future::RemoteHandle>>, + Vec>, + )>, + options: BBIWriteOptions, + runtime: &Runtime, + ) -> ( + Vec<( + u32, + futures_mpsc::Sender< + Pin> + Send>>, + >, + )>, + futures_mpsc::Sender< + Pin> + Send>>, + >, + ) { let (ftx, sections_handle, buf, section_receiver) = future_channel(options.channel_size, runtime.handle(), options.inmemory); @@ -788,7 +807,7 @@ pub(crate) fn write_vals< } (zooms_channels, ftx) - }; + } let mut do_read = |chrom: String| -> Result<_, ProcessChromError<_>> { let length = match chrom_sizes.get(&chrom) { Some(length) => *length, @@ -802,7 +821,7 @@ pub(crate) fn write_vals< // Make a new id for the chromosome let chrom_id = chrom_ids.get_id(&chrom); - let (zooms_channels, ftx) = setup_chrom(); + let (zooms_channels, ftx) = setup_chrom(&mut send, options, &runtime); let internal_data = crate::InternalProcessData( zooms_channels, From 583cbcbd9fbc831866498c49bd0adc303d2ff1fa Mon Sep 17 00:00:00 2001 From: Jack Huey <31162821+jackh726@users.noreply.github.com> Date: Sun, 21 Apr 2024 11:35:24 -0400 Subject: [PATCH 17/31] Don't box for ChromProcessingInputSectionChannel --- bigtools/src/bbi/bbiwrite.rs | 25 +++++++------------------ bigtools/src/bbi/bigbedwrite.rs | 28 +++++++--------------------- bigtools/src/bbi/bigwigwrite.rs | 18 +++++------------- 3 files changed, 19 insertions(+), 52 deletions(-) diff --git a/bigtools/src/bbi/bbiwrite.rs b/bigtools/src/bbi/bbiwrite.rs index 800f177..8e62068 100644 --- a/bigtools/src/bbi/bbiwrite.rs +++ b/bigtools/src/bbi/bbiwrite.rs @@ -4,7 +4,6 @@ use std::fs::File; use std::future::Future; use std::io::{self, BufWriter, Seek, SeekFrom, Write}; use std::iter::Flatten; -use std::pin::Pin; use std::vec; use byteorder::{NativeEndian, WriteBytesExt}; @@ -129,9 +128,8 @@ pub(crate) struct TempZoomInfo { pub sections: crossbeam_channel::Receiver
, } -pub(crate) type ChromProcessingInputSectionChannel = futures::channel::mpsc::Sender< - Pin> + Send>>, ->; +pub(crate) type ChromProcessingInputSectionChannel = + futures_mpsc::Sender>>; const MAX_ZOOM_LEVELS: usize = 10; @@ -766,15 +764,8 @@ pub(crate) fn write_vals< options: BBIWriteOptions, runtime: &Runtime, ) -> ( - Vec<( - u32, - futures_mpsc::Sender< - Pin> + Send>>, - >, - )>, - futures_mpsc::Sender< - Pin> + Send>>, - >, + Vec<(u32, ChromProcessingInputSectionChannel)>, + ChromProcessingInputSectionChannel, ) { let (ftx, sections_handle, buf, section_receiver) = future_channel(options.channel_size, runtime.handle(), options.inmemory); @@ -1280,13 +1271,13 @@ pub(crate) fn write_mid( async fn write_data( mut data_file: W, section_sender: crossbeam_channel::Sender
, - mut frx: futures_mpsc::Receiver> + Send>, + mut frx: futures_mpsc::Receiver>>, ) -> Result<(usize, usize), ProcessChromError> { let mut current_offset = 0; let mut total = 0; let mut max_uncompressed_buf_size = 0; while let Some(section_raw) = frx.next().await { - let (section, uncompressed_buf_size): (SectionData, usize) = section_raw.await?; + let (section, uncompressed_buf_size): (SectionData, usize) = section_raw.await.unwrap()?; max_uncompressed_buf_size = max_uncompressed_buf_size.max(uncompressed_buf_size); total += 1; let size = section.data.len() as u64; @@ -1310,9 +1301,7 @@ pub(crate) fn future_channel ( - futures_mpsc::Sender< - Pin> + Send>>, - >, + ChromProcessingInputSectionChannel, futures::future::RemoteHandle>>, TempFileBuffer, crossbeam_channel::Receiver
, diff --git a/bigtools/src/bbi/bigbedwrite.rs b/bigtools/src/bbi/bigbedwrite.rs index 80bde65..97cbf2d 100644 --- a/bigtools/src/bbi/bigbedwrite.rs +++ b/bigtools/src/bbi/bigbedwrite.rs @@ -4,7 +4,6 @@ use std::ffi::CString; use std::fs::File; use std::io::{self, BufWriter, Write}; -use futures::future::FutureExt; use futures::sink::SinkExt; use byteorder::{NativeEndian, WriteBytesExt}; @@ -364,10 +363,8 @@ impl BigBedWrite { if next_val.is_none() || items.len() >= options.items_per_slot as usize { let items = std::mem::replace(items, Vec::with_capacity(options.items_per_slot as usize)); - let handle = runtime - .spawn(encode_section(options.compress, items, chrom_id)) - .map(|f| f.unwrap()); - ftx.send(handle.boxed()).await.expect("Couldn't send"); + let handle = runtime.spawn(encode_section(options.compress, items, chrom_id)); + ftx.send(handle).await.expect("Couldn't send"); } Ok(()) @@ -453,14 +450,9 @@ impl BigBedWrite { } if !zoom_item.records.is_empty() { let items = std::mem::take(&mut zoom_item.records); - let handle = runtime - .spawn(encode_zoom_section(options.compress, items)) - .map(|f| f.unwrap()); - zoom_item - .channel - .send(handle.boxed()) - .await - .expect("Couln't send"); + let handle = + runtime.spawn(encode_zoom_section(options.compress, items)); + zoom_item.channel.send(handle).await.expect("Couln't send"); } } break; @@ -515,14 +507,8 @@ impl BigBedWrite { // Write section if full if zoom_item.records.len() == options.items_per_slot as usize { let items = std::mem::take(&mut zoom_item.records); - let handle = runtime - .spawn(encode_zoom_section(options.compress, items)) - .map(|f| f.unwrap()); - zoom_item - .channel - .send(handle.boxed()) - .await - .expect("Couln't send"); + let handle = runtime.spawn(encode_zoom_section(options.compress, items)); + zoom_item.channel.send(handle).await.expect("Couln't send"); } } } diff --git a/bigtools/src/bbi/bigwigwrite.rs b/bigtools/src/bbi/bigwigwrite.rs index 58ecbab..f105109 100644 --- 
a/bigtools/src/bbi/bigwigwrite.rs +++ b/bigtools/src/bbi/bigwigwrite.rs @@ -44,7 +44,6 @@ use std::fs::File; use std::io::{self, BufWriter, Write}; use std::vec; -use futures::future::FutureExt; use futures::sink::SinkExt; use byteorder::{NativeEndian, WriteBytesExt}; @@ -302,10 +301,9 @@ impl BigWigWrite { items.push(current_val); if next_val.is_none() || items.len() >= options.items_per_slot as usize { let items = std::mem::take(items); - let handle = runtime - .spawn(encode_section(options.compress, items, chrom_id)) - .map(|f| f.unwrap()); - ftx.send(handle.boxed()).await.expect("Couldn't send"); + let handle: tokio::task::JoinHandle> = + runtime.spawn(encode_section(options.compress, items, chrom_id)); + ftx.send(handle).await.expect("Couldn't send"); } Ok(()) @@ -338,14 +336,8 @@ impl BigWigWrite { || zoom_item.records.len() == options.items_per_slot as usize { let items = std::mem::take(&mut zoom_item.records); - let handle = runtime - .spawn(encode_zoom_section(options.compress, items)) - .map(|f| f.unwrap()); - zoom_item - .channel - .send(handle.boxed()) - .await - .expect("Couln't send"); + let handle = runtime.spawn(encode_zoom_section(options.compress, items)); + zoom_item.channel.send(handle).await.expect("Couln't send"); } if add_start >= current_val.end { if next_val.is_none() { From c8fb881defbc769c31a88d19fade988d083ea124 Mon Sep 17 00:00:00 2001 From: Jack Huey <31162821+jackh726@users.noreply.github.com> Date: Sun, 21 Apr 2024 11:53:08 -0400 Subject: [PATCH 18/31] Stop using remote_handle --- bigtools/src/bbi/bbiwrite.rs | 51 ++++++++++++++++----------------- bigtools/src/bbi/bigwigwrite.rs | 2 +- 2 files changed, 25 insertions(+), 28 deletions(-) diff --git a/bigtools/src/bbi/bbiwrite.rs b/bigtools/src/bbi/bbiwrite.rs index 8e62068..d04d18d 100644 --- a/bigtools/src/bbi/bbiwrite.rs +++ b/bigtools/src/bbi/bbiwrite.rs @@ -12,7 +12,6 @@ use thiserror::Error; use futures::channel::mpsc as futures_mpsc; use futures::channel::mpsc::channel; -use 
futures::future::FutureExt; use futures::stream::StreamExt; use serde::{Deserialize, Serialize}; @@ -121,9 +120,8 @@ pub enum ProcessChromError { pub(crate) struct TempZoomInfo { pub resolution: u32, - pub data_write_future: Box< - dyn Future>> + Send + Unpin, - >, + pub data_write_future: + tokio::task::JoinHandle>>, pub data: TempFileBuffer>, pub sections: crossbeam_channel::Receiver
, } @@ -578,13 +576,13 @@ pub(crate) type ZoomValue = ( type Data = ( crossbeam_channel::Receiver
, TempFileBuffer>, - futures::future::RemoteHandle>>, + tokio::task::JoinHandle>>, Vec>, ); type DataWithoutzooms = ( crossbeam_channel::Receiver
, TempFileBuffer>, - futures::future::RemoteHandle>>, + tokio::task::JoinHandle>>, ); async fn write_chroms_with_zooms( @@ -623,7 +621,7 @@ async fn write_chroms_with_zooms( } // All the futures are actually just handles, so these are purely for the result - let (_num_sections, uncompressed_buf_size) = data_write_future.await?; + let (_num_sections, uncompressed_buf_size) = data_write_future.await.unwrap()?; max_uncompressed_buf_size = max_uncompressed_buf_size.max(uncompressed_buf_size); section_iter.push(sections.into_iter()); file = data.await_real_file(); @@ -637,7 +635,7 @@ async fn write_chroms_with_zooms( { let zoom = zooms_map.get_mut(&resolution).unwrap(); let data_write_data = data_write_future.await; - let (_num_sections, uncompressed_buf_size) = match data_write_data { + let (_num_sections, uncompressed_buf_size) = match data_write_data.unwrap() { Ok(d) => d, Err(e) => return Err(e), }; @@ -674,7 +672,7 @@ async fn write_chroms_without_zooms( data.switch(file); // All the futures are actually just handles, so these are purely for the result - let (_num_sections, uncompressed_buf_size) = data_write_future.await?; + let (_num_sections, uncompressed_buf_size) = data_write_future.await.unwrap()?; max_uncompressed_buf_size = max_uncompressed_buf_size.max(uncompressed_buf_size); section_iter.push(sections.into_iter()); file = data.await_real_file(); @@ -751,14 +749,13 @@ pub(crate) fn write_vals< let mut summary: Option = None; let (mut send, recv) = futures_mpsc::unbounded(); let write_fut = write_chroms_with_zooms(file, zooms_map, recv); - let (write_fut, write_fut_handle) = write_fut.remote_handle(); - runtime.spawn(write_fut); + let write_fut_handle = runtime.spawn(write_fut); fn setup_chrom( send: &mut futures_mpsc::UnboundedSender<( crossbeam_channel::Receiver
, TempFileBuffer>, - futures::future::RemoteHandle>>, + tokio::task::JoinHandle>>, Vec>, )>, options: BBIWriteOptions, @@ -778,11 +775,11 @@ pub(crate) fn write_vals< std::iter::successors(Some(options.initial_zoom_size), |z| Some(z * 4)) .take(options.max_zooms as usize); for size in zoom_sizes { - let (ftx, handle, buf, section_receiver) = + let (ftx, data_write_future, buf, section_receiver) = future_channel(options.channel_size, runtime.handle(), options.inmemory); let zoom_info = TempZoomInfo { resolution: size, - data_write_future: Box::new(handle), + data_write_future, data: buf, sections: section_receiver, }; @@ -857,7 +854,7 @@ pub(crate) fn write_vals< }); let (file, max_uncompressed_buf_size, section_iter, zooms_map) = - runtime.block_on(write_fut_handle)?; + runtime.block_on(write_fut_handle).unwrap()?; let zoom_infos: Vec = zooms_map .into_iter() @@ -927,8 +924,7 @@ pub(crate) fn write_vals_no_zoom< let mut summary: Option = None; let (send, recv) = futures_mpsc::unbounded(); let write_fut = write_chroms_without_zooms::(file, recv); - let (write_fut, write_fut_handle) = write_fut.remote_handle(); - runtime.spawn(write_fut); + let write_fut_handle = runtime.spawn(write_fut); let setup_chrom = || { let (ftx, sections_handle, buf, section_receiver) = @@ -1004,7 +1000,8 @@ pub(crate) fn write_vals_no_zoom< sum_squares: 0.0, }); - let (file, max_uncompressed_buf_size, section_iter) = runtime.block_on(write_fut_handle)?; + let (file, max_uncompressed_buf_size, section_iter) = + runtime.block_on(write_fut_handle).unwrap()?; let section_iter = section_iter.into_iter().flatten(); Ok(( @@ -1026,9 +1023,9 @@ type InternalZoomValue = ( pub(crate) struct InternalTempZoomInfo { pub resolution: u32, - pub data_write_future: Box< - dyn Future>> + Send + Unpin, - >, + + pub data_write_future: + tokio::task::JoinHandle>>, pub data: TempFileBuffer>>, pub sections: crossbeam_channel::Receiver
, } @@ -1101,11 +1098,11 @@ pub(crate) fn write_zoom_vals< let mut zooms_channels = Vec::with_capacity(options.max_zooms as usize); for size in resolutions.iter().copied() { - let (ftx, handle, buf, section_receiver) = + let (ftx, data_write_future, buf, section_receiver) = future_channel(options.channel_size, runtime.handle(), options.inmemory); let zoom_info = InternalTempZoomInfo { resolution: size, - data_write_future: Box::new(handle), + data_write_future, data: buf, sections: section_receiver, }; @@ -1150,7 +1147,7 @@ pub(crate) fn write_zoom_vals< { // First, we need to make sure that all the sections that were queued to encode have been written let data_write_data = runtime.block_on(data_write_future); - let (_num_sections, uncompressed_buf_size) = match data_write_data { + let (_num_sections, uncompressed_buf_size) = match data_write_data.unwrap() { Ok(d) => d, Err(e) => return Err(e), }; @@ -1302,7 +1299,7 @@ pub(crate) fn future_channel ( ChromProcessingInputSectionChannel, - futures::future::RemoteHandle>>, + tokio::task::JoinHandle>>, TempFileBuffer, crossbeam_channel::Receiver
, ) { @@ -1311,8 +1308,8 @@ pub(crate) fn future_channel ChromProcessCreate for BigWigZoomsProcess { ZoomsInternalProcessedData(self.temp_zoom_items) } } -impl ChromProcess for BigWigZoomsProcess { +impl ChromProcess for BigWigZoomsProcess { type Value = Value; async fn do_process( &mut self, From 6300bcdb7b42183ceee7da6d1116630665a4fa5a Mon Sep 17 00:00:00 2001 From: Jack Huey <31162821+jackh726@users.noreply.github.com> Date: Sun, 21 Apr 2024 12:23:58 -0400 Subject: [PATCH 19/31] Minor cleanups --- bigtools/src/bbi/bbiwrite.rs | 45 ++++++++++++------------------------ 1 file changed, 15 insertions(+), 30 deletions(-) diff --git a/bigtools/src/bbi/bbiwrite.rs b/bigtools/src/bbi/bbiwrite.rs index d04d18d..b2303f3 100644 --- a/bigtools/src/bbi/bbiwrite.rs +++ b/bigtools/src/bbi/bbiwrite.rs @@ -232,13 +232,10 @@ pub(crate) fn write_chrom_tree( key_bytes[..chrom_bytes.len()].copy_from_slice(chrom_bytes); file.write_all(key_bytes)?; file.write_u32::(*id)?; - let length = chrom_sizes.get(&chrom[..]); - match length { - None => panic!("Expected length for chrom: {}", chrom), - Some(l) => { - file.write_u32::(*l)?; - } - } + let length = chrom_sizes + .get(&chrom[..]) + .expect(&format!("Expected length for chrom: {}", chrom)); + file.write_u32::(*length)?; } Ok(()) } @@ -602,9 +599,8 @@ async fn write_chroms_with_zooms( let mut max_uncompressed_buf_size = 0; loop { let read = receiver.next().await; - let (sections, mut data, data_write_future, mut zooms) = match read { - None => break, - Some(read) => read, + let Some((sections, mut data, data_write_future, mut zooms)) = read else { + break; }; // If we concurrently processing multiple chromosomes, the section buffer might have written some or all to a separate file // Switch that processing output to the real file @@ -635,10 +631,7 @@ async fn write_chroms_with_zooms( { let zoom = zooms_map.get_mut(&resolution).unwrap(); let data_write_data = data_write_future.await; - let (_num_sections, uncompressed_buf_size) = match 
data_write_data.unwrap() { - Ok(d) => d, - Err(e) => return Err(e), - }; + let (_num_sections, uncompressed_buf_size) = data_write_data.unwrap()?; max_uncompressed_buf_size = max_uncompressed_buf_size.max(uncompressed_buf_size); zoom.0.push(sections.into_iter()); zoom.2.replace(data.await_real_file()); @@ -663,9 +656,8 @@ async fn write_chroms_without_zooms( let mut max_uncompressed_buf_size = 0; loop { let read = receiver.next().await; - let (sections, mut data, data_write_future) = match read { - None => break, - Some(read) => read, + let Some((sections, mut data, data_write_future)) = read else { + break; }; // If we concurrently processing multiple chromosomes, the section buffer might have written some or all to a separate file // Switch that processing output to the real file @@ -789,10 +781,8 @@ pub(crate) fn write_vals< (zoom_infos, zooms_channels) }; - match send.unbounded_send((section_receiver, buf, sections_handle, zoom_infos)) { - Ok(_) => {} - Err(_) => panic!("Expected to always send."), - } + send.unbounded_send((section_receiver, buf, sections_handle, zoom_infos)) + .expect("Expected to always send."); (zooms_channels, ftx) } @@ -930,10 +920,8 @@ pub(crate) fn write_vals_no_zoom< let (ftx, sections_handle, buf, section_receiver) = future_channel(options.channel_size, runtime.handle(), options.inmemory); - match send.unbounded_send((section_receiver, buf, sections_handle)) { - Ok(_) => {} - Err(_) => panic!("Expected to always send."), - } + send.unbounded_send((section_receiver, buf, sections_handle)) + .expect("Expected to always send."); ftx }; @@ -1147,10 +1135,7 @@ pub(crate) fn write_zoom_vals< { // First, we need to make sure that all the sections that were queued to encode have been written let data_write_data = runtime.block_on(data_write_future); - let (_num_sections, uncompressed_buf_size) = match data_write_data.unwrap() { - Ok(d) => d, - Err(e) => return Err(e), - }; + let (_num_sections, uncompressed_buf_size) = 
data_write_data.unwrap()?; max_uncompressed_buf_size = max_uncompressed_buf_size.max(uncompressed_buf_size); let zoom = zooms_map.get_mut(&resolution).unwrap(); @@ -1313,7 +1298,7 @@ pub(crate) fn future_channel Date: Mon, 6 May 2024 21:13:55 -0400 Subject: [PATCH 20/31] Fix perf for BedParserStreamingIterator by moving out block_on and refactor write_zoom_vals --- bigtools/src/bbi/bbiwrite.rs | 120 ++++++++++++++++------------ bigtools/src/bbi/bedchromdata.rs | 131 ++++++++++++++++--------------- bigtools/src/bbi/bigwigwrite.rs | 3 +- 3 files changed, 137 insertions(+), 117 deletions(-) diff --git a/bigtools/src/bbi/bbiwrite.rs b/bigtools/src/bbi/bbiwrite.rs index b2303f3..bab825c 100644 --- a/bigtools/src/bbi/bbiwrite.rs +++ b/bigtools/src/bbi/bbiwrite.rs @@ -1003,11 +1003,11 @@ pub(crate) fn write_vals_no_zoom< } // Zooms have to be double-buffered: first because chroms could be processed in parallel and second because we don't know the offset of each zoom immediately -type InternalZoomValue = ( - Vec>, - TempFileBuffer>, - Option>>, -); +type ZoomSender = futures_mpsc::Sender<( + tokio::task::JoinHandle>, + TempFileBuffer>>, + crossbeam_channel::Receiver
, +)>; pub(crate) struct InternalTempZoomInfo { pub resolution: u32, @@ -1046,7 +1046,9 @@ pub(crate) fn write_zoom_vals< data_size: u64, ) -> Result<(BufWriter, Vec, usize), ProcessChromError> { let min_first_zoom_size = average_size.max(10) * 4; - let mut zooms_map: BTreeMap = zoom_counts + let mut zoom_receivers = vec![]; + let mut zoom_files = vec![]; + let mut zooms_map: BTreeMap>> = zoom_counts .into_iter() .skip_while(|z| z.0 > min_first_zoom_size as u64) .skip_while(|z| { @@ -1058,18 +1060,19 @@ pub(crate) fn write_zoom_vals< }) .take(options.max_zooms as usize) .map(|size| { - let section_iter = vec![]; let (buf, write) = TempFileBuffer::new(options.inmemory); - let value = (section_iter, buf, Some(write)); - (size.0 as u32, value) + let (sender, receiver) = futures_mpsc::channel(chrom_ids.len()); + zoom_receivers.push((size.0, receiver, write)); + zoom_files.push((size.0 as u32, buf)); + (size.0 as u32, sender) }) .collect(); let resolutions: Vec<_> = zooms_map.keys().copied().collect(); let first_zoom_data_offset = file.tell()?; // We can immediately start to write to the file the first zoom - match zooms_map.first_entry() { - Some(mut first) => first.get_mut().1.switch(file), + match zoom_files.first_mut() { + Some(first) => first.1.switch(file), None => return Ok((file, vec![], 0)), } @@ -1112,54 +1115,69 @@ pub(crate) fn write_zoom_vals< let mut advance = |p: P| { let data = p.destroy(); - let ZoomsInternalProcessedData(mut zooms) = data; - - // For each zoom, switch the current chromosome to write to the actual zoom file - for InternalTempZoomInfo { - resolution: size, - data, - .. - } in zooms.iter_mut() - { - let zoom = zooms_map.get_mut(size).unwrap(); - let writer = zoom.2.take().unwrap(); - data.switch(writer); - } + let ZoomsInternalProcessedData(zooms) = data; for InternalTempZoomInfo { resolution, - data_write_future, data, + data_write_future, sections, + .. 
} in zooms.into_iter() { - // First, we need to make sure that all the sections that were queued to encode have been written - let data_write_data = runtime.block_on(data_write_future); - let (_num_sections, uncompressed_buf_size) = data_write_data.unwrap()?; - max_uncompressed_buf_size = max_uncompressed_buf_size.max(uncompressed_buf_size); - let zoom = zooms_map.get_mut(&resolution).unwrap(); - // Add the section data to the zoom - zoom.0.push(sections.into_iter()); - // Replace the zoom file again - zoom.2.replace(data.await_real_file()); + zoom.try_send((data_write_future, data, sections)).unwrap(); } Ok(()) }; + let mut zooms = Vec::with_capacity(zoom_receivers.len()); + for rcv in zoom_receivers { + let mut sections = vec![]; + let handle = runtime.spawn(async move { + let (_, mut rcv, mut real_file) = rcv; + while let Some(r) = rcv.next().await { + let (data_write_future, mut data, sections_rcv) = r; + + data.switch(real_file); + + // First, we need to make sure that all the sections that were queued to encode have been written + let data_write_data = data_write_future.await; + let (_num_sections, uncompressed_buf_size) = match data_write_data.unwrap() { + Ok(d) => d, + Err(e) => { + return Err(e); + } + }; + max_uncompressed_buf_size = max_uncompressed_buf_size.max(uncompressed_buf_size); + + // Replace the zoom file again + real_file = data.await_real_file(); + + sections.push(sections_rcv.into_iter()); + } + Ok((real_file, sections)) + }); + zooms.push(handle); + } + vals_iter.process_to_bbi(&runtime, &mut do_read, &mut advance)?; - let mut zoom_entries = Vec::with_capacity(zooms_map.len()); - let mut zooms_map_iter = zooms_map.into_iter(); + drop(zooms_map); + + let mut zoom_entries = Vec::with_capacity(zooms.len()); + let zooms_iter = zooms.into_iter(); + let zooms_files = zoom_files.into_iter(); + let mut zip = zooms_iter.zip(zooms_files); + + // We already switched this zoom to the real file, so need to treat this a bit different + let 
(first_zoom_fut, first_data) = zip.next().expect("Should have at least one zoom"); + let first_zoom = runtime.block_on(first_zoom_fut).unwrap()?; - // Since the first zoom has already been written to the file, no need to - let first_zoom = zooms_map_iter - .next() - .expect("Should have at least one zoom"); // First, we can drop the writer - no more data - drop(first_zoom.1 .2); - let first_zoom_sections = first_zoom.1 .0.into_iter().flatten(); + drop(first_zoom.0); + let first_zoom_sections = first_zoom.1.into_iter().flatten(); let mut current_offset = first_zoom_data_offset; let sections_iter = first_zoom_sections.map(|mut section| { // TODO: assumes contiguous, see note for primary data @@ -1167,24 +1185,25 @@ pub(crate) fn write_zoom_vals< current_offset += section.size; section }); - // First zoom has already switched, real data - file = first_zoom.1 .1.await_real_file(); + file = first_data.1.await_real_file(); // Generate the rtree index let (nodes, levels, total_sections) = get_rtreeindex(sections_iter, options); let first_zoom_index_offset = file.tell()?; write_rtreeindex(&mut file, nodes, levels, total_sections, options)?; zoom_entries.push(ZoomHeader { - reduction_level: first_zoom.0, + reduction_level: first_data.0, data_offset: first_zoom_data_offset, index_offset: first_zoom_index_offset, index_tree_offset: None, }); - while let Some(mut zoom) = zooms_map_iter.next() { + while let Some(zoom) = zip.next() { + let (zoom_fut, data) = zoom; + let (real_file, sections) = runtime.block_on(zoom_fut).unwrap()?; let zoom_data_offset = file.tell()?; // First, we can drop the writer - no more data - drop(zoom.1 .2); - let zoom_sections = zoom.1 .0.into_iter().flatten(); + drop(real_file); + let zoom_sections = sections.into_iter().flatten(); let mut current_offset = zoom_data_offset; let sections_iter = zoom_sections.map(|mut section| { // TODO: assumes contiguous, see note for primary data @@ -1193,14 +1212,13 @@ pub(crate) fn write_zoom_vals< section }); // 
Subsequence zooms have not switched to real file - zoom.1 .1.switch(file); - file = zoom.1 .1.await_real_file(); + data.1.expect_closed_write(&mut file)?; // Generate the rtree index let (nodes, levels, total_sections) = get_rtreeindex(sections_iter, options); let zoom_index_offset = file.tell()?; write_rtreeindex(&mut file, nodes, levels, total_sections, options)?; zoom_entries.push(ZoomHeader { - reduction_level: first_zoom.0, + reduction_level: data.0, data_offset: zoom_data_offset, index_offset: zoom_index_offset, index_tree_offset: None, diff --git a/bigtools/src/bbi/bedchromdata.rs b/bigtools/src/bbi/bedchromdata.rs index 2e25beb..597083c 100644 --- a/bigtools/src/bbi/bedchromdata.rs +++ b/bigtools/src/bbi/bedchromdata.rs @@ -99,74 +99,75 @@ impl ChromData for BedParserStreamingIterator { start_processing: &mut StartProcessing, advance: &mut Advance, ) -> Result<(), ProcessChromError> { - let mut state: Option<(String, P, Option>)> = None; - - loop { - let (curr_value, new_state) = match state { - Some((c, p, Some(v))) => (Some(v), Some((c, p))), - Some((c, p, None)) => (self.bed_data.next(), Some((c, p))), - None => (self.bed_data.next(), None), - }; - state = match (new_state, curr_value) { - // The next value is an error, but we never started - (None, Some(Err(e))) => return Err(ProcessChromError::SourceError(e)), - // There are no values at all - (None, None) => return Ok(()), - // There are no more values - (Some(state), None) => { - advance(state.1)?; - return Ok(()); - } - // The next value is an error and we have seen values before - (Some(state), Some(Err(e))) => { - // We *can* do anything since we've encountered an error. - // We'll go ahead and try to finish what we can, before we return. 
- advance(state.1)?; - return Err(ProcessChromError::SourceError(e)); - } - // The next value is the first - (None, Some(Ok((chrom, val)))) => { - let chrom = chrom.to_string(); - let mut p = start_processing(chrom.clone())?; - let next_val = self.bed_data.next(); - let next_value = match &next_val { - Some(Ok(v)) if v.0 == chrom => Some(&v.1), - _ => None, - }; - runtime.block_on(p.do_process(val, next_value))?; - Some((chrom, p, next_val)) - } - // The next value is the same chromosome - (Some((prev_chrom, mut p)), Some(Ok((chrom, val)))) if chrom == &prev_chrom => { - let next_val = self.bed_data.next(); - let next_value = match &next_val { - Some(Ok(v)) if v.0 == prev_chrom => Some(&v.1), - _ => None, - }; - runtime.block_on(p.do_process(val, next_value))?; - Some((prev_chrom, p, next_val)) - } - // The next value is a different chromosome - (Some((prev_chrom, p)), Some(Ok((chrom, val)))) => { - // TODO: test this correctly fails - if !self.allow_out_of_order_chroms && prev_chrom.as_str() >= chrom { - return Err(ProcessChromError::SourceError(BedValueError::InvalidInput("Input bedGraph not sorted by chromosome. Sort with `sort -k1,1 -k2,2n`.".to_string()))); + runtime.block_on(async move { + let mut state: Option<(String, P, Option>)> = None; + loop { + let (curr_value, new_state) = match state { + Some((c, p, Some(v))) => (Some(v), Some((c, p))), + Some((c, p, None)) => (self.bed_data.next(), Some((c, p))), + None => (self.bed_data.next(), None), + }; + state = match (new_state, curr_value) { + // The next value is an error, but we never started + (None, Some(Err(e))) => return Err(ProcessChromError::SourceError(e)), + // There are no values at all + (None, None) => return Ok(()), + // There are no more values + (Some(state), None) => { + advance(state.1)?; + return Ok(()); + } + // The next value is an error and we have seen values before + (Some(state), Some(Err(e))) => { + // We *can* do anything since we've encountered an error. 
+ // We'll go ahead and try to finish what we can, before we return. + advance(state.1)?; + return Err(ProcessChromError::SourceError(e)); } - advance(p)?; + // The next value is the first + (None, Some(Ok((chrom, val)))) => { + let chrom = chrom.to_string(); + let mut p = start_processing(chrom.clone())?; + let next_val = self.bed_data.next(); + let next_value = match &next_val { + Some(Ok(v)) if v.0 == chrom => Some(&v.1), + _ => None, + }; + p.do_process(val, next_value).await?; + Some((chrom, p, next_val)) + } + // The next value is the same chromosome + (Some((prev_chrom, mut p)), Some(Ok((chrom, val)))) if chrom == &prev_chrom => { + let next_val = self.bed_data.next(); + let next_value = match &next_val { + Some(Ok(v)) if v.0 == prev_chrom => Some(&v.1), + _ => None, + }; + p.do_process(val, next_value).await?; + Some((prev_chrom, p, next_val)) + } + // The next value is a different chromosome + (Some((prev_chrom, p)), Some(Ok((chrom, val)))) => { + // TODO: test this correctly fails + if !self.allow_out_of_order_chroms && prev_chrom.as_str() >= chrom { + return Err(ProcessChromError::SourceError(BedValueError::InvalidInput("Input bedGraph not sorted by chromosome. 
Sort with `sort -k1,1 -k2,2n`.".to_string()))); + } + advance(p)?; - let chrom = chrom.to_string(); - let mut p = start_processing(chrom.clone())?; - let next_val = self.bed_data.next(); - let next_value = match &next_val { - Some(Ok(v)) if v.0 == chrom => Some(&v.1), - _ => None, - }; + let chrom = chrom.to_string(); + let mut p = start_processing(chrom.clone())?; + let next_val = self.bed_data.next(); + let next_value = match &next_val { + Some(Ok(v)) if v.0 == chrom => Some(&v.1), + _ => None, + }; - runtime.block_on(p.do_process(val, next_value))?; - Some((chrom, p, next_val)) - } - }; - } + p.do_process(val, next_value).await?; + Some((chrom, p, next_val)) + } + }; + } + }) } } diff --git a/bigtools/src/bbi/bigwigwrite.rs b/bigtools/src/bbi/bigwigwrite.rs index d794a7b..0e45ef8 100644 --- a/bigtools/src/bbi/bigwigwrite.rs +++ b/bigtools/src/bbi/bigwigwrite.rs @@ -300,7 +300,8 @@ impl BigWigWrite { // Then, add the current item to the actual values, and encode if full, or last item items.push(current_val); if next_val.is_none() || items.len() >= options.items_per_slot as usize { - let items = std::mem::take(items); + let items = + std::mem::replace(items, Vec::with_capacity(options.items_per_slot as usize)); let handle: tokio::task::JoinHandle> = runtime.spawn(encode_section(options.compress, items, chrom_id)); ftx.send(handle).await.expect("Couldn't send"); From d186edb8461f6c6cc30f962d21644d6dec18606e Mon Sep 17 00:00:00 2001 From: Jack Huey <31162821+jackh726@users.noreply.github.com> Date: Tue, 7 May 2024 16:11:37 -0400 Subject: [PATCH 21/31] Add Send bound to Er --- bigtools/src/bbi/bigbedwrite.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigtools/src/bbi/bigbedwrite.rs b/bigtools/src/bbi/bigbedwrite.rs index 97cbf2d..98e8326 100644 --- a/bigtools/src/bbi/bigbedwrite.rs +++ b/bigtools/src/bbi/bigbedwrite.rs @@ -854,7 +854,7 @@ impl ChromProcessCreate for BigBedZoomsProcess { ZoomsInternalProcessedData(self.temp_zoom_items) } } -impl 
ChromProcess for BigBedZoomsProcess { +impl ChromProcess for BigBedZoomsProcess { type Value = BedEntry; async fn do_process( &mut self, From 41dffe95e643e8ded62973be80ae45aad30e127f Mon Sep 17 00:00:00 2001 From: Jack Huey <31162821+jackh726@users.noreply.github.com> Date: Tue, 7 May 2024 20:09:13 -0400 Subject: [PATCH 22/31] Mostly fix parallelism in bigwigtobedgraph and bigbedtobed --- bigtools/src/bed/bedparser.rs | 4 +- bigtools/src/utils/cli/bigbedtobed.rs | 52 ++++++++++++++-------- bigtools/src/utils/cli/bigwigtobedgraph.rs | 52 ++++++++++++++-------- 3 files changed, 70 insertions(+), 38 deletions(-) diff --git a/bigtools/src/bed/bedparser.rs b/bigtools/src/bed/bedparser.rs index 775603f..f4e1bed 100644 --- a/bigtools/src/bed/bedparser.rs +++ b/bigtools/src/bed/bedparser.rs @@ -17,7 +17,7 @@ use crate::bbi::{BedEntry, Value}; use crate::utils::streaming_linereader::StreamingLineReader; pub fn parse_bed<'a>(s: &'a str) -> Option> { - let mut split = s.splitn(4, '\t'); + let mut split = s.trim_end().splitn(4, '\t'); let chrom = match split.next() { Some(chrom) => chrom, None => return None, @@ -45,7 +45,7 @@ pub fn parse_bed<'a>(s: &'a str) -> Option(s: &'a str) -> Option> { - let mut split = s.splitn(5, '\t'); + let mut split = s.trim_end().splitn(5, '\t'); let chrom = match split.next() { Some(chrom) => chrom, None => return None, diff --git a/bigtools/src/utils/cli/bigbedtobed.rs b/bigtools/src/utils/cli/bigbedtobed.rs index 2ff8027..3724838 100644 --- a/bigtools/src/utils/cli/bigbedtobed.rs +++ b/bigtools/src/utils/cli/bigbedtobed.rs @@ -1,11 +1,10 @@ -use std::collections::VecDeque; use std::error::Error; use std::fs::File; use std::io::{self, BufReader, Write}; use std::path::Path; use clap::Parser; -use futures::FutureExt; +use futures::{SinkExt, StreamExt}; use tokio::runtime; use ufmt::uwrite; @@ -157,8 +156,6 @@ pub fn write_bed( let mut remaining_chroms = bigbed.chroms().to_vec(); remaining_chroms.reverse(); - let mut chrom_files: VecDeque<_> = 
VecDeque::new(); - async fn file_future( mut bigbed: BigBedRead, chrom: ChromInfo, @@ -186,30 +183,49 @@ pub fn write_bed( Ok(()) } - loop { - while chrom_files.len() < nthreads { + let (mut handle_snd, mut handle_rcv) = futures::channel::mpsc::channel(nthreads); + let (mut buf_snd, mut buf_rcv) = futures::channel::mpsc::unbounded(); + runtime.spawn(async move { + loop { let Some(chrom) = remaining_chroms.pop() else { - break; + return Ok::<_, BBIReadError>(()); }; let bigbed = bigbed.reopen()?; let (buf, file): (TempFileBuffer, TempFileBufferWriter) = TempFileBuffer::new(inmemory); let writer = io::BufWriter::new(file); - let handle = runtime - .spawn(file_future(bigbed, chrom, writer)) - .map(|f| f.unwrap()); - chrom_files.push_back((handle, buf)); + let handle = tokio::task::spawn(file_future(bigbed, chrom, writer)); + + handle_snd.send(handle).await.unwrap(); + buf_snd.send(buf).await.unwrap(); } + }); - let Some((f, mut buf)) = chrom_files.pop_front() else { - break; - }; + let data_handle = runtime.spawn(async move { + loop { + let next = handle_rcv.next().await; + let Some(handle) = next else { + return Ok::<_, BBIReadError>(()); + }; + handle.await.unwrap()?; + } + }); + runtime.block_on(async move { + loop { + let next = buf_rcv.next().await; + let Some(mut buf) = next else { + data_handle.await.unwrap()?; + return Ok::<_, BBIReadError>(()); + }; - buf.switch(out_file); - runtime.block_on(f).unwrap(); - out_file = buf.await_real_file(); - } + buf.switch(out_file); + while !buf.is_real_file_ready() { + tokio::task::yield_now().await; + } + out_file = buf.await_real_file(); + } + })?; Ok(()) } diff --git a/bigtools/src/utils/cli/bigwigtobedgraph.rs b/bigtools/src/utils/cli/bigwigtobedgraph.rs index 458d2a3..14fef3f 100644 --- a/bigtools/src/utils/cli/bigwigtobedgraph.rs +++ b/bigtools/src/utils/cli/bigwigtobedgraph.rs @@ -1,4 +1,3 @@ -use std::collections::VecDeque; use std::error::Error; use std::fs::File; use std::io::{self, BufReader, Write}; @@ -7,7 +6,7 
@@ use std::path::Path; use crate::utils::streaming_linereader::StreamingLineReader; use clap::Parser; -use futures::FutureExt; +use futures::{SinkExt, StreamExt}; use crate::utils::reopen::{Reopen, SeekableRead}; use crate::utils::tempfilebuffer::{TempFileBuffer, TempFileBufferWriter}; @@ -158,8 +157,6 @@ pub fn write_bg( let mut remaining_chroms = bigwig.chroms().to_vec(); remaining_chroms.reverse(); - let mut chrom_files: VecDeque<_> = VecDeque::new(); - async fn file_future( mut bigwig: BigWigRead, chrom: ChromInfo, @@ -184,30 +181,49 @@ pub fn write_bg( Ok(()) } - loop { - while chrom_files.len() < nthreads { + let (mut handle_snd, mut handle_rcv) = futures::channel::mpsc::channel(nthreads); + let (mut buf_snd, mut buf_rcv) = futures::channel::mpsc::unbounded(); + runtime.spawn(async move { + loop { let Some(chrom) = remaining_chroms.pop() else { - break; + return Ok::<_, BBIReadError>(()); }; let bigbed = bigwig.reopen()?; let (buf, file): (TempFileBuffer, TempFileBufferWriter) = TempFileBuffer::new(inmemory); let writer = io::BufWriter::new(file); - let handle = runtime - .spawn(file_future(bigbed, chrom, writer)) - .map(|f| f.unwrap()); - chrom_files.push_back((handle, buf)); + let handle = tokio::task::spawn(file_future(bigbed, chrom, writer)); + + handle_snd.send(handle).await.unwrap(); + buf_snd.send(buf).await.unwrap(); } + }); - let Some((f, mut buf)) = chrom_files.pop_front() else { - break; - }; + let data_handle = runtime.spawn(async move { + loop { + let next = handle_rcv.next().await; + let Some(handle) = next else { + return Ok::<_, BBIReadError>(()); + }; + handle.await.unwrap()?; + } + }); + runtime.block_on(async move { + loop { + let next = buf_rcv.next().await; + let Some(mut buf) = next else { + data_handle.await.unwrap()?; + return Ok::<_, BBIReadError>(()); + }; - buf.switch(out_file); - runtime.block_on(f).unwrap(); - out_file = buf.await_real_file(); - } + buf.switch(out_file); + while !buf.is_real_file_ready() { + 
tokio::task::yield_now().await; + } + out_file = buf.await_real_file(); + } + })?; Ok(()) } From e018c4ae0c63e98f994dfa92e6f74d7d9f8c5868 Mon Sep 17 00:00:00 2001 From: Jack Huey <31162821+jackh726@users.noreply.github.com> Date: Wed, 26 Jun 2024 10:49:57 -0400 Subject: [PATCH 23/31] Cleanup bedchromdata function --- bigtools/src/bbi/bedchromdata.rs | 67 +++++++++++++++----------------- 1 file changed, 32 insertions(+), 35 deletions(-) diff --git a/bigtools/src/bbi/bedchromdata.rs b/bigtools/src/bbi/bedchromdata.rs index 597083c..3c5ac41 100644 --- a/bigtools/src/bbi/bedchromdata.rs +++ b/bigtools/src/bbi/bedchromdata.rs @@ -100,59 +100,56 @@ impl ChromData for BedParserStreamingIterator { advance: &mut Advance, ) -> Result<(), ProcessChromError> { runtime.block_on(async move { - let mut state: Option<(String, P, Option>)> = None; + let first_val = self.bed_data.next(); + let (mut curr_state, mut next_val) = match first_val { + // The first value is an error + Some(Err(e)) => return Err(ProcessChromError::SourceError(e)), + // There are no values at all + None => return Ok(()), + // The next value is the first + Some(Ok((chrom, val))) => { + let chrom = chrom.to_string(); + let mut p = start_processing(chrom.clone())?; + let next_val = self.bed_data.next(); + let next_value = match &next_val { + Some(Ok(v)) if v.0 == chrom => Some(&v.1), + _ => None, + }; + p.do_process(val, next_value).await?; + ((chrom, p), next_val) + } + }; loop { - let (curr_value, new_state) = match state { - Some((c, p, Some(v))) => (Some(v), Some((c, p))), - Some((c, p, None)) => (self.bed_data.next(), Some((c, p))), - None => (self.bed_data.next(), None), - }; - state = match (new_state, curr_value) { - // The next value is an error, but we never started - (None, Some(Err(e))) => return Err(ProcessChromError::SourceError(e)), - // There are no values at all - (None, None) => return Ok(()), + (curr_state, next_val) = match (curr_state, next_val) { // There are no more values - (Some(state), 
None) => { - advance(state.1)?; + ((_, curr_state), None) => { + advance(curr_state)?; return Ok(()); } // The next value is an error and we have seen values before - (Some(state), Some(Err(e))) => { + ((_, curr_state), Some(Err(e))) => { // We *can* do anything since we've encountered an error. // We'll go ahead and try to finish what we can, before we return. - advance(state.1)?; + advance(curr_state)?; return Err(ProcessChromError::SourceError(e)); } - // The next value is the first - (None, Some(Ok((chrom, val)))) => { - let chrom = chrom.to_string(); - let mut p = start_processing(chrom.clone())?; - let next_val = self.bed_data.next(); - let next_value = match &next_val { - Some(Ok(v)) if v.0 == chrom => Some(&v.1), - _ => None, - }; - p.do_process(val, next_value).await?; - Some((chrom, p, next_val)) - } // The next value is the same chromosome - (Some((prev_chrom, mut p)), Some(Ok((chrom, val)))) if chrom == &prev_chrom => { + ((curr_chrom, mut curr_state), Some(Ok((chrom, val)))) if chrom == &curr_chrom => { let next_val = self.bed_data.next(); let next_value = match &next_val { - Some(Ok(v)) if v.0 == prev_chrom => Some(&v.1), + Some(Ok(v)) if v.0 == curr_chrom => Some(&v.1), _ => None, }; - p.do_process(val, next_value).await?; - Some((prev_chrom, p, next_val)) + curr_state.do_process(val, next_value).await?; + ((curr_chrom, curr_state), next_val) } // The next value is a different chromosome - (Some((prev_chrom, p)), Some(Ok((chrom, val)))) => { + ((curr_chrom, curr_state), Some(Ok((chrom, val)))) => { // TODO: test this correctly fails - if !self.allow_out_of_order_chroms && prev_chrom.as_str() >= chrom { + if !self.allow_out_of_order_chroms && curr_chrom.as_str() >= chrom { return Err(ProcessChromError::SourceError(BedValueError::InvalidInput("Input bedGraph not sorted by chromosome. 
Sort with `sort -k1,1 -k2,2n`.".to_string()))); } - advance(p)?; + advance(curr_state)?; let chrom = chrom.to_string(); let mut p = start_processing(chrom.clone())?; @@ -163,7 +160,7 @@ impl ChromData for BedParserStreamingIterator { }; p.do_process(val, next_value).await?; - Some((chrom, p, next_val)) + ((chrom, p), next_val) } }; } From 041d652fc55c8c12f9ba43e7bc3348ad4f403795 Mon Sep 17 00:00:00 2001 From: Jack Huey <31162821+jackh726@users.noreply.github.com> Date: Wed, 26 Jun 2024 10:53:26 -0400 Subject: [PATCH 24/31] Eagerly check error --- bigtools/src/bbi/bedchromdata.rs | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/bigtools/src/bbi/bedchromdata.rs b/bigtools/src/bbi/bedchromdata.rs index 3c5ac41..1228ddf 100644 --- a/bigtools/src/bbi/bedchromdata.rs +++ b/bigtools/src/bbi/bedchromdata.rs @@ -111,8 +111,13 @@ impl ChromData for BedParserStreamingIterator { let chrom = chrom.to_string(); let mut p = start_processing(chrom.clone())?; let next_val = self.bed_data.next(); + let next_val = match next_val { + Some(Err(e)) => return Err(ProcessChromError::SourceError(e)), + Some(Ok(v)) => Some(v), + None => None, + }; let next_value = match &next_val { - Some(Ok(v)) if v.0 == chrom => Some(&v.1), + Some(v) if v.0 == chrom => Some(&v.1), _ => None, }; p.do_process(val, next_value).await?; @@ -126,25 +131,23 @@ impl ChromData for BedParserStreamingIterator { advance(curr_state)?; return Ok(()); } - // The next value is an error and we have seen values before - ((_, curr_state), Some(Err(e))) => { - // We *can* do anything since we've encountered an error. - // We'll go ahead and try to finish what we can, before we return. 
- advance(curr_state)?; - return Err(ProcessChromError::SourceError(e)); - } // The next value is the same chromosome - ((curr_chrom, mut curr_state), Some(Ok((chrom, val)))) if chrom == &curr_chrom => { + ((curr_chrom, mut curr_state), Some((chrom, val))) if chrom == &curr_chrom => { let next_val = self.bed_data.next(); + let next_val = match next_val { + Some(Err(e)) => return Err(ProcessChromError::SourceError(e)), + Some(Ok(v)) => Some(v), + None => None, + }; let next_value = match &next_val { - Some(Ok(v)) if v.0 == curr_chrom => Some(&v.1), + Some(v) if v.0 == curr_chrom => Some(&v.1), _ => None, }; curr_state.do_process(val, next_value).await?; ((curr_chrom, curr_state), next_val) } // The next value is a different chromosome - ((curr_chrom, curr_state), Some(Ok((chrom, val)))) => { + ((curr_chrom, curr_state), Some((chrom, val))) => { // TODO: test this correctly fails if !self.allow_out_of_order_chroms && curr_chrom.as_str() >= chrom { return Err(ProcessChromError::SourceError(BedValueError::InvalidInput("Input bedGraph not sorted by chromosome. 
Sort with `sort -k1,1 -k2,2n`.".to_string()))); @@ -154,8 +157,13 @@ impl ChromData for BedParserStreamingIterator { let chrom = chrom.to_string(); let mut p = start_processing(chrom.clone())?; let next_val = self.bed_data.next(); + let next_val = match next_val { + Some(Err(e)) => return Err(ProcessChromError::SourceError(e)), + Some(Ok(v)) => Some(v), + None => None, + }; let next_value = match &next_val { - Some(Ok(v)) if v.0 == chrom => Some(&v.1), + Some(v) if v.0 == chrom => Some(&v.1), _ => None, }; From ad8937a161a2e7635e358cd18453ccdfeb850f74 Mon Sep 17 00:00:00 2001 From: Jack Huey <31162821+jackh726@users.noreply.github.com> Date: Wed, 26 Jun 2024 11:18:28 -0400 Subject: [PATCH 25/31] Use &mut instead of passing by val to eliminate memmoves --- bigtools/src/bbi/bedchromdata.rs | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/bigtools/src/bbi/bedchromdata.rs b/bigtools/src/bbi/bedchromdata.rs index 1228ddf..51d23fd 100644 --- a/bigtools/src/bbi/bedchromdata.rs +++ b/bigtools/src/bbi/bedchromdata.rs @@ -125,14 +125,14 @@ impl ChromData for BedParserStreamingIterator { } }; loop { - (curr_state, next_val) = match (curr_state, next_val) { + next_val = match (&mut curr_state, next_val) { // There are no more values - ((_, curr_state), None) => { - advance(curr_state)?; + ((_, _), None) => { + advance(curr_state.1)?; return Ok(()); } // The next value is the same chromosome - ((curr_chrom, mut curr_state), Some((chrom, val))) if chrom == &curr_chrom => { + ((curr_chrom, curr_state), Some((chrom, val))) if chrom == curr_chrom => { let next_val = self.bed_data.next(); let next_val = match next_val { Some(Err(e)) => return Err(ProcessChromError::SourceError(e)), @@ -144,15 +144,16 @@ impl ChromData for BedParserStreamingIterator { _ => None, }; curr_state.do_process(val, next_value).await?; - ((curr_chrom, curr_state), next_val) + next_val } // The next value is a different chromosome - ((curr_chrom, curr_state), 
Some((chrom, val))) => { + (_, Some((chrom, val))) => { + let (prev_chrom, prev_state) = curr_state; // TODO: test this correctly fails - if !self.allow_out_of_order_chroms && curr_chrom.as_str() >= chrom { + if !self.allow_out_of_order_chroms && prev_chrom.as_str() >= chrom { return Err(ProcessChromError::SourceError(BedValueError::InvalidInput("Input bedGraph not sorted by chromosome. Sort with `sort -k1,1 -k2,2n`.".to_string()))); } - advance(curr_state)?; + advance(prev_state)?; let chrom = chrom.to_string(); let mut p = start_processing(chrom.clone())?; @@ -168,7 +169,8 @@ impl ChromData for BedParserStreamingIterator { }; p.do_process(val, next_value).await?; - ((chrom, p), next_val) + curr_state = (chrom, p); + next_val } }; } From de00dab6669b190ca93ac1bd8ea331a488fe94de Mon Sep 17 00:00:00 2001 From: jackh726 Date: Fri, 28 Jun 2024 22:17:13 +0000 Subject: [PATCH 26/31] Bump to 0.5.0-dev --- Cargo.lock | 2 +- bigtools/Cargo.toml | 2 +- pybigtools/Cargo.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 75c86e3..8c5a99e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -109,7 +109,7 @@ checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" [[package]] name = "bigtools" -version = "0.4.4-dev" +version = "0.5.0-dev" dependencies = [ "attohttpc", "bincode", diff --git a/bigtools/Cargo.toml b/bigtools/Cargo.toml index 51b28fa..9618343 100644 --- a/bigtools/Cargo.toml +++ b/bigtools/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "bigtools" -version = "0.4.4-dev" +version = "0.5.0-dev" authors = ["Jack Huey "] edition = "2021" license = "MIT" diff --git a/pybigtools/Cargo.toml b/pybigtools/Cargo.toml index 38164d1..7e156e9 100644 --- a/pybigtools/Cargo.toml +++ b/pybigtools/Cargo.toml @@ -9,7 +9,7 @@ name = "pybigtools" crate-type = ["cdylib"] [dependencies] -bigtools = { version = "0.4.4-dev", path = "../bigtools", default_features = false, features = ["read", "write"] } +bigtools = { version = 
"0.5.0-dev", path = "../bigtools", default_features = false, features = ["read", "write"] } url = "2.4.0" tokio = { version = "1.34.0", features = ["rt", "rt-multi-thread"] } futures = { version = "0.3.1", features = ["thread-pool"] } From 29cef5ffa4c30aa865875b01019965ed34fe4723 Mon Sep 17 00:00:00 2001 From: Jack Huey <31162821+jackh726@users.noreply.github.com> Date: Thu, 4 Jul 2024 01:11:42 -0400 Subject: [PATCH 27/31] Rename ChromData to BBIDataSource and ChromProcess to BBIDataProcessor. Cleanup docs and rename chsi vars to data. --- bigtools/src/bbi.rs | 2 +- bigtools/src/bbi/bbiwrite.rs | 72 ++++++++++--------- .../src/bbi/{bedchromdata.rs => beddata.rs} | 28 ++++---- bigtools/src/bbi/bigbedwrite.rs | 36 +++++----- bigtools/src/bbi/bigwigwrite.rs | 38 +++++----- bigtools/src/lib.rs | 26 +++---- bigtools/src/utils/cli/bedgraphtobigwig.rs | 10 +-- bigtools/src/utils/cli/bedtobigbed.rs | 24 +++---- bigtools/src/utils/cli/bigwigmerge.rs | 8 +-- bigtools/tests/bigbedwrite.rs | 6 +- bigtools/tests/bigwigwrite.rs | 14 ++-- pybigtools/src/lib.rs | 10 +-- 12 files changed, 136 insertions(+), 138 deletions(-) rename bigtools/src/bbi/{bedchromdata.rs => beddata.rs} (93%) diff --git a/bigtools/src/bbi.rs b/bigtools/src/bbi.rs index d4fb53f..928f8fd 100644 --- a/bigtools/src/bbi.rs +++ b/bigtools/src/bbi.rs @@ -3,7 +3,7 @@ pub(crate) mod bbiread; #[cfg(feature = "write")] pub(crate) mod bbiwrite; #[cfg(feature = "write")] -pub mod bedchromdata; +pub mod beddata; #[cfg(feature = "read")] pub(crate) mod bigbedread; #[cfg(feature = "write")] diff --git a/bigtools/src/bbi/bbiwrite.rs b/bigtools/src/bbi/bbiwrite.rs index bab825c..60e34e2 100644 --- a/bigtools/src/bbi/bbiwrite.rs +++ b/bigtools/src/bbi/bbiwrite.rs @@ -126,7 +126,7 @@ pub(crate) struct TempZoomInfo { pub sections: crossbeam_channel::Receiver
, } -pub(crate) type ChromProcessingInputSectionChannel = +pub(crate) type BBIDataProcessoringInputSectionChannel = futures_mpsc::Sender>>; const MAX_ZOOM_LEVELS: usize = 10; @@ -537,23 +537,25 @@ pub(crate) fn write_zooms( Ok(zoom_entries) } -/// Potential states encountered when reading `ChromData` -pub enum ChromDataState { - /// We've encountered a new chromosome - NewChrom(ChromOutput), - Finished, - Error(Error), -} - -pub struct ChromProcessedData(pub(crate) Summary); - -/// Effectively like an Iterator of chromosome data -pub trait ChromData: Sized { +pub(crate) struct BBIDataProcessoredData(pub(crate) Summary); + +/// This trait abstracts over processing the data for a bbi file. Generally, +/// users should not need to implement this directly, but rather use provided +/// structs like `BedParserStreamingIterator`. However, this does provide a +/// lower-level API that can be useful for custom value generation or +/// scheduling logic. +/// +/// When `process_to_bbi` is called, it is expected that the function +/// `start_processing` is called with the chromosome name in-order. This +/// function returns a `Result` for `P`, which represents the active state for +/// writing the data for that chromosome. For each value, `do_process` should +/// be called, which writes it to the file. 
+pub trait BBIDataSource: Sized { type Value; type Error: Error + Send + 'static; fn process_to_bbi< - P: ChromProcess + Send + 'static, + P: BBIDataProcessor + Send + 'static, StartProcessing: FnMut(String) -> Result>, Advance: FnMut(P) -> Result<(), ProcessChromError>, >( @@ -674,8 +676,8 @@ async fn write_chroms_without_zooms( } pub struct InternalProcessData( - pub(crate) Vec<(u32, ChromProcessingInputSectionChannel)>, - pub(crate) ChromProcessingInputSectionChannel, + pub(crate) Vec<(u32, BBIDataProcessoringInputSectionChannel)>, + pub(crate) BBIDataProcessoringInputSectionChannel, pub(crate) u32, pub(crate) BBIWriteOptions, pub(crate) Handle, @@ -684,7 +686,7 @@ pub struct InternalProcessData( ); pub(crate) mod process_internal { - pub trait ChromProcessCreate { + pub trait BBIDataProcessorCreate { type I; type Out; fn create(internal_data: Self::I) -> Self; @@ -692,7 +694,7 @@ pub(crate) mod process_internal { } } -pub trait ChromProcess: process_internal::ChromProcessCreate { +pub trait BBIDataProcessor: process_internal::BBIDataProcessorCreate { type Value: Send + 'static; fn do_process( &mut self, @@ -702,10 +704,12 @@ pub trait ChromProcess: process_internal::ChromProcessCreate { } pub(crate) fn write_vals< - V: ChromData, - P: ChromProcess - + process_internal::ChromProcessCreate - + Send + V: BBIDataSource, + P: BBIDataProcessor + + process_internal::BBIDataProcessorCreate< + I = InternalProcessData, + Out = BBIDataProcessoredData, + > + Send + 'static, >( mut vals_iter: V, @@ -753,8 +757,8 @@ pub(crate) fn write_vals< options: BBIWriteOptions, runtime: &Runtime, ) -> ( - Vec<(u32, ChromProcessingInputSectionChannel)>, - ChromProcessingInputSectionChannel, + Vec<(u32, BBIDataProcessoringInputSectionChannel)>, + BBIDataProcessoringInputSectionChannel, ) { let (ftx, sections_handle, buf, section_receiver) = future_channel(options.channel_size, runtime.handle(), options.inmemory); @@ -815,7 +819,7 @@ pub(crate) fn write_vals< let mut advance = |p: P| { 
let data = p.destroy(); - let ChromProcessedData(chrom_summary) = data; + let BBIDataProcessoredData(chrom_summary) = data; match &mut summary { None => summary = Some(chrom_summary), Some(summary) => { @@ -870,7 +874,7 @@ pub(crate) fn write_vals< } pub(crate) struct NoZoomsInternalProcessData( - pub(crate) ChromProcessingInputSectionChannel, + pub(crate) BBIDataProcessoringInputSectionChannel, pub(crate) u32, pub(crate) BBIWriteOptions, pub(crate) Handle, @@ -880,9 +884,9 @@ pub(crate) struct NoZoomsInternalProcessData( pub(crate) struct NoZoomsInternalProcessedData(pub(crate) Summary, pub(crate) Vec<(u64, u64)>); pub(crate) fn write_vals_no_zoom< - V: ChromData, - P: ChromProcess - + process_internal::ChromProcessCreate< + V: BBIDataSource, + P: BBIDataProcessor + + process_internal::BBIDataProcessorCreate< I = NoZoomsInternalProcessData, Out = NoZoomsInternalProcessedData, > + Send @@ -1020,7 +1024,7 @@ pub(crate) struct InternalTempZoomInfo { pub(crate) struct ZoomsInternalProcessData( pub(crate) Vec>, - pub(crate) Vec<(u32, ChromProcessingInputSectionChannel)>, + pub(crate) Vec<(u32, BBIDataProcessoringInputSectionChannel)>, pub(crate) u32, pub(crate) BBIWriteOptions, pub(crate) Handle, @@ -1028,9 +1032,9 @@ pub(crate) struct ZoomsInternalProcessData( pub(crate) struct ZoomsInternalProcessedData(pub(crate) Vec>); pub(crate) fn write_zoom_vals< - V: ChromData, - P: ChromProcess - + process_internal::ChromProcessCreate< + V: BBIDataSource, + P: BBIDataProcessor + + process_internal::BBIDataProcessorCreate< I = ZoomsInternalProcessData, Out = ZoomsInternalProcessedData, > + Send @@ -1301,7 +1305,7 @@ pub(crate) fn future_channel ( - ChromProcessingInputSectionChannel, + BBIDataProcessoringInputSectionChannel, tokio::task::JoinHandle>>, TempFileBuffer, crossbeam_channel::Receiver
, diff --git a/bigtools/src/bbi/bedchromdata.rs b/bigtools/src/bbi/beddata.rs similarity index 93% rename from bigtools/src/bbi/bedchromdata.rs rename to bigtools/src/bbi/beddata.rs index 51d23fd..d7ed779 100644 --- a/bigtools/src/bbi/bedchromdata.rs +++ b/bigtools/src/bbi/beddata.rs @@ -19,7 +19,7 @@ use crate::bed::bedparser::{ }; use crate::utils::file_view::FileView; use crate::utils::streaming_linereader::StreamingLineReader; -use crate::{BedEntry, ChromData, ChromProcess, ProcessChromError, Value}; +use crate::{BBIDataProcessor, BBIDataSource, BedEntry, ProcessChromError, Value}; pub struct BedParserStreamingIterator { bed_data: S, @@ -85,12 +85,12 @@ impl + for<'a> PartialEq<&'a str>, I: Iterator ChromData for BedParserStreamingIterator { +impl BBIDataSource for BedParserStreamingIterator { type Value = S::Value; type Error = BedValueError; fn process_to_bbi< - P: ChromProcess, + P: BBIDataProcessor, StartProcessing: FnMut(String) -> Result>, Advance: FnMut(P) -> Result<(), ProcessChromError>, >( @@ -207,12 +207,12 @@ impl BedParserParallelStreamingIterator { } } -impl ChromData for BedParserParallelStreamingIterator { +impl BBIDataSource for BedParserParallelStreamingIterator { type Value = V; type Error = BedValueError; fn process_to_bbi< - P: ChromProcess + Send + 'static, + P: BBIDataProcessor + Send + 'static, StartProcessing: FnMut(String) -> Result>, Advance: FnMut(P) -> Result<(), ProcessChromError>, >( @@ -300,7 +300,7 @@ impl ChromData for BedParserParallelStreamingIterator { mod tests { use super::*; use crate::bed::bedparser::parse_bedgraph; - use crate::process_internal::ChromProcessCreate; + use crate::process_internal::BBIDataProcessorCreate; use crate::{ProcessChromError, Value}; use std::fs::File; use std::io; @@ -315,7 +315,7 @@ mod tests { let chrom_indices: Vec<(u64, String)> = crate::bed::indexer::index_chroms(File::open(dir.clone())?)?.unwrap(); - let mut chsi = BedParserParallelStreamingIterator::new( + let mut data = 
BedParserParallelStreamingIterator::new( chrom_indices, true, PathBuf::from(dir.clone()), @@ -323,18 +323,18 @@ mod tests { ); let runtime = tokio::runtime::Builder::new_multi_thread().build().unwrap(); let mut counts = vec![]; - struct TestChromProcess { + struct TestBBIDataProcessor { count: usize, } - impl ChromProcessCreate for TestChromProcess { + impl BBIDataProcessorCreate for TestBBIDataProcessor { type I = (); type Out = (); fn create(_: Self::I) -> Self { - TestChromProcess { count: 0 } + TestBBIDataProcessor { count: 0 } } fn destroy(self) -> Self::Out {} } - impl ChromProcess for TestChromProcess { + impl BBIDataProcessor for TestBBIDataProcessor { type Value = Value; async fn do_process( &mut self, @@ -346,13 +346,13 @@ mod tests { Ok(()) } } - let mut start_processing = |_: String| Ok(TestChromProcess::create(())); - let mut advance = |p: TestChromProcess| { + let mut start_processing = |_: String| Ok(TestBBIDataProcessor::create(())); + let mut advance = |p: TestBBIDataProcessor| { counts.push(p.count); let _ = p.destroy(); Ok(()) }; - chsi.process_to_bbi(&runtime, &mut start_processing, &mut advance) + data.process_to_bbi(&runtime, &mut start_processing, &mut advance) .unwrap(); assert_eq!(counts, vec![200, 200, 200, 200, 200, 2000]); diff --git a/bigtools/src/bbi/bigbedwrite.rs b/bigtools/src/bbi/bigbedwrite.rs index 98e8326..c3d6951 100644 --- a/bigtools/src/bbi/bigbedwrite.rs +++ b/bigtools/src/bbi/bigbedwrite.rs @@ -9,12 +9,12 @@ use futures::sink::SinkExt; use byteorder::{NativeEndian, WriteBytesExt}; use tokio::runtime::{Handle, Runtime}; -use crate::bbiwrite::process_internal::ChromProcessCreate; +use crate::bbiwrite::process_internal::BBIDataProcessorCreate; use crate::utils::indexlist::IndexList; use crate::utils::tell::Tell; use crate::{ - write_info, ChromData, ChromProcess, ChromProcessedData, ChromProcessingInputSectionChannel, - InternalProcessData, InternalTempZoomInfo, NoZoomsInternalProcessData, + write_info, BBIDataProcessor, 
BBIDataProcessoredData, BBIDataProcessoringInputSectionChannel, + BBIDataSource, InternalProcessData, InternalTempZoomInfo, NoZoomsInternalProcessData, NoZoomsInternalProcessedData, ZoomsInternalProcessData, ZoomsInternalProcessedData, }; @@ -80,7 +80,7 @@ impl BigBedWrite { } /// Write the values from `V` as a bigWig. Will utilize the provided runtime for encoding values and for reading through the values (potentially parallelized by chromosome). - pub fn write>( + pub fn write>( self, chrom_sizes: HashMap, vals: V, @@ -140,7 +140,7 @@ impl BigBedWrite { /// Write the values from `V` as a bigBed. Will utilize the provided runtime for encoding values and for reading through the values (potentially parallelized by chromosome). /// This will take two passes on the provided values: first to write the values themselves, then the zooms. This is beneficial over `write` on smaller files, where the encoding of /// high resolution zooms takes up a substantial portion of total processing time. - pub fn write_multipass>( + pub fn write_multipass>( self, make_vals: impl Fn() -> Result>, chrom_sizes: HashMap, @@ -221,7 +221,7 @@ impl BigBedWrite { overlap: &mut IndexList, options: BBIWriteOptions, runtime: &Handle, - ftx: &mut ChromProcessingInputSectionChannel, + ftx: &mut BBIDataProcessoringInputSectionChannel, chrom_id: u32, ) -> Result<(), ProcessChromError> { // Check a few preconditions: @@ -525,7 +525,7 @@ struct ZoomItem { live_info: Option<(ZoomRecord, u64)>, overlap: IndexList, records: Vec, - channel: ChromProcessingInputSectionChannel, + channel: BBIDataProcessoringInputSectionChannel, } struct EntriesSection { items: Vec, @@ -538,7 +538,7 @@ pub(crate) struct BigBedFullProcess { state_val: EntriesSection, total_items: u64, - ftx: ChromProcessingInputSectionChannel, + ftx: BBIDataProcessoringInputSectionChannel, chrom_id: u32, options: BBIWriteOptions, runtime: Handle, @@ -546,10 +546,10 @@ pub(crate) struct BigBedFullProcess { length: u32, } -impl 
ChromProcessCreate for BigBedFullProcess { +impl BBIDataProcessorCreate for BigBedFullProcess { type I = InternalProcessData; - type Out = ChromProcessedData; - fn destroy(self) -> ChromProcessedData { + type Out = BBIDataProcessoredData; + fn destroy(self) -> BBIDataProcessoredData { let Self { summary, total_items, @@ -575,7 +575,7 @@ impl ChromProcessCreate for BigBedFullProcess { Some(summary) => summary, }; summary_complete.total_items = total_items; - ChromProcessedData(summary_complete) + BBIDataProcessoredData(summary_complete) } fn create(internal_data: InternalProcessData) -> Self { let InternalProcessData(zooms_channels, ftx, chrom_id, options, runtime, chrom, length) = @@ -612,7 +612,7 @@ impl ChromProcessCreate for BigBedFullProcess { } } } -impl ChromProcess for BigBedFullProcess { +impl BBIDataProcessor for BigBedFullProcess { type Value = BedEntry; async fn do_process( &mut self, @@ -675,7 +675,7 @@ struct ZoomCounts { counts: u64, } struct BigBedNoZoomsProcess { - ftx: ChromProcessingInputSectionChannel, + ftx: BBIDataProcessoringInputSectionChannel, chrom_id: u32, options: BBIWriteOptions, runtime: Handle, @@ -689,7 +689,7 @@ struct BigBedNoZoomsProcess { total_items: u64, } -impl ChromProcessCreate for BigBedNoZoomsProcess { +impl BBIDataProcessorCreate for BigBedNoZoomsProcess { type I = NoZoomsInternalProcessData; type Out = NoZoomsInternalProcessedData; fn create(internal_data: Self::I) -> Self { @@ -752,7 +752,7 @@ impl ChromProcessCreate for BigBedNoZoomsProcess { } } -impl ChromProcess for BigBedNoZoomsProcess { +impl BBIDataProcessor for BigBedNoZoomsProcess { type Value = BedEntry; async fn do_process( &mut self, @@ -817,7 +817,7 @@ struct BigBedZoomsProcess { zoom_items: Vec, } -impl ChromProcessCreate for BigBedZoomsProcess { +impl BBIDataProcessorCreate for BigBedZoomsProcess { type I = ZoomsInternalProcessData; type Out = ZoomsInternalProcessedData; fn create(internal_data: Self::I) -> Self { @@ -854,7 +854,7 @@ impl 
ChromProcessCreate for BigBedZoomsProcess { ZoomsInternalProcessedData(self.temp_zoom_items) } } -impl ChromProcess for BigBedZoomsProcess { +impl BBIDataProcessor for BigBedZoomsProcess { type Value = BedEntry; async fn do_process( &mut self, diff --git a/bigtools/src/bbi/bigwigwrite.rs b/bigtools/src/bbi/bigwigwrite.rs index 0e45ef8..8cb2063 100644 --- a/bigtools/src/bbi/bigwigwrite.rs +++ b/bigtools/src/bbi/bigwigwrite.rs @@ -8,7 +8,7 @@ Provides the interface for writing bigWig files. # use std::path::PathBuf; # use std::fs::File; # use bigtools::BigWigWrite; -# use bigtools::bedchromdata::BedParserStreamingIterator; +# use bigtools::beddata::BedParserStreamingIterator; # fn main() -> Result<(), Box> { # let mut dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")); # dir.push("resources/test"); @@ -49,11 +49,11 @@ use futures::sink::SinkExt; use byteorder::{NativeEndian, WriteBytesExt}; use tokio::runtime::{Handle, Runtime}; -use crate::bbiwrite::process_internal::ChromProcessCreate; +use crate::bbiwrite::process_internal::BBIDataProcessorCreate; use crate::utils::tell::Tell; use crate::{ - write_info, ChromData, ChromProcess, ChromProcessedData, ChromProcessingInputSectionChannel, - InternalProcessData, InternalTempZoomInfo, NoZoomsInternalProcessData, + write_info, BBIDataProcessor, BBIDataProcessoredData, BBIDataProcessoringInputSectionChannel, + BBIDataSource, InternalProcessData, InternalTempZoomInfo, NoZoomsInternalProcessData, NoZoomsInternalProcessedData, ZoomsInternalProcessData, ZoomsInternalProcessedData, }; @@ -70,7 +70,7 @@ struct ZoomItem { live_info: Option, // All zoom entries in the current section records: Vec, - channel: ChromProcessingInputSectionChannel, + channel: BBIDataProcessoringInputSectionChannel, } /// The struct used to write a bigWig file @@ -110,7 +110,7 @@ impl BigWigWrite { } /// Write the values from `V` as a bigWig. 
Will utilize the provided runtime for encoding values and for reading through the values (potentially parallelized by chromosome). - pub fn write>( + pub fn write>( self, chrom_sizes: HashMap, vals: V, @@ -175,7 +175,7 @@ impl BigWigWrite { /// Write the values from `V` as a bigWig. Will utilize the provided runtime for encoding values and for reading through the values (potentially parallelized by chromosome). /// This will take two passes on the provided values: first to write the values themselves, then the zooms. This is beneficial over `write` on smaller files, where the encoding of /// high resolution zooms takes up a substantial portion of total processing time. - pub fn write_multipass>( + pub fn write_multipass>( self, make_vals: impl Fn() -> Result>, chrom_sizes: HashMap, @@ -253,7 +253,7 @@ impl BigWigWrite { items: &mut Vec, options: BBIWriteOptions, runtime: &Handle, - ftx: &mut ChromProcessingInputSectionChannel, + ftx: &mut BBIDataProcessoringInputSectionChannel, chrom_id: u32, ) -> Result<(), BigWigInvalidInput> { // Check a few preconditions: @@ -404,7 +404,7 @@ pub(crate) struct BigWigFullProcess { items: Vec, zoom_items: Vec, - ftx: ChromProcessingInputSectionChannel, + ftx: BBIDataProcessoringInputSectionChannel, chrom_id: u32, options: BBIWriteOptions, runtime: Handle, @@ -412,9 +412,9 @@ pub(crate) struct BigWigFullProcess { length: u32, } -impl ChromProcessCreate for BigWigFullProcess { +impl BBIDataProcessorCreate for BigWigFullProcess { type I = InternalProcessData; - type Out = ChromProcessedData; + type Out = BBIDataProcessoredData; fn create(internal_data: InternalProcessData) -> Self { let InternalProcessData(zooms_channels, ftx, chrom_id, options, runtime, chrom, length) = internal_data; @@ -451,7 +451,7 @@ impl ChromProcessCreate for BigWigFullProcess { length, } } - fn destroy(self) -> ChromProcessedData { + fn destroy(self) -> BBIDataProcessoredData { let Self { mut summary, items, @@ -469,11 +469,11 @@ impl ChromProcessCreate for 
BigWigFullProcess { summary.min_val = 0.0; summary.max_val = 0.0; } - ChromProcessedData(summary) + BBIDataProcessoredData(summary) } } -impl ChromProcess for BigWigFullProcess { +impl BBIDataProcessor for BigWigFullProcess { type Value = Value; async fn do_process( &mut self, @@ -530,7 +530,7 @@ struct ZoomCounts { counts: u64, } struct BigWigNoZoomsProcess { - ftx: ChromProcessingInputSectionChannel, + ftx: BBIDataProcessoringInputSectionChannel, chrom_id: u32, options: BBIWriteOptions, runtime: Handle, @@ -542,7 +542,7 @@ struct BigWigNoZoomsProcess { zoom_counts: Vec, } -impl ChromProcessCreate for BigWigNoZoomsProcess { +impl BBIDataProcessorCreate for BigWigNoZoomsProcess { type I = NoZoomsInternalProcessData; type Out = NoZoomsInternalProcessedData; fn create(internal_data: Self::I) -> Self { @@ -604,7 +604,7 @@ impl ChromProcessCreate for BigWigNoZoomsProcess { } } -impl ChromProcess for BigWigNoZoomsProcess { +impl BBIDataProcessor for BigWigNoZoomsProcess { type Value = Value; async fn do_process( &mut self, @@ -661,7 +661,7 @@ struct BigWigZoomsProcess { zoom_items: Vec, } -impl ChromProcessCreate for BigWigZoomsProcess { +impl BBIDataProcessorCreate for BigWigZoomsProcess { type I = ZoomsInternalProcessData; type Out = ZoomsInternalProcessedData; fn create(internal_data: Self::I) -> Self { @@ -697,7 +697,7 @@ impl ChromProcessCreate for BigWigZoomsProcess { ZoomsInternalProcessedData(self.temp_zoom_items) } } -impl ChromProcess for BigWigZoomsProcess { +impl BBIDataProcessor for BigWigZoomsProcess { type Value = Value; async fn do_process( &mut self, diff --git a/bigtools/src/lib.rs b/bigtools/src/lib.rs index dfaa484..5df2258 100644 --- a/bigtools/src/lib.rs +++ b/bigtools/src/lib.rs @@ -27,22 +27,16 @@ Generally, bigWig and bigBed writing is done per chromosome, with compression and io being done on an async Runtime. The source for data to be written to bigWigs and bigBeds come from the -[`ChromData`] trait. 
It's effectively like a powerful `Iterator` of -`ChromData::Output` values, which itself is like peekable `Iterator` of values -(either [`Value`]s or [`BedEntry`]s for bigWigs or bigBeds respectively) over a -chromosome. The [`ChromData::advance`] method takes a function that the current -chromosome and returns a `Future` that will asynchronously process chromosomal -data. This data can be returned immediately from the `advance` method, or can -be stored (with the correct implementation) to queue multiple chromosomes -simulatenously. The -[`BedParserStreamingIterator`][crate::bbi::bedchromdata::BedParserStreamingIterator] -and -[`BedParserParallelStreamingIterator`][crate::bbi::bedchromdata::BedParserParallelStreamingIterator] -types provide serial processing of a bed-like value stream (either from a -file or an iterator) or concurrent processing from a file. Generally, these -underlying details aren't necessary unless implementing a new data source. - -Given some implementation of [`ChromData`] (like [`BedParserStreamingIterator`][crate::bbi::bedchromdata::BedParserStreamingIterator]), +[`BBIDataSource`] trait. It is used to abstract over processing the data +for a bbi file. It is a lower-level API that can be useful for custom value +generation or scheduling logic. Generally though, users should not need to +implement this directly, but rather use the provided [`BedParserStreamingIterator`][crate::bbi::beddata::BedParserStreamingIterator] +and [`BedParserParallelStreamingIterator`][crate::bbi::beddata::BedParserParallelStreamingIterator] +types, which provide serial processing of a bed-like value stream (either from a +file or an iterator) or concurrent processing from a file. See the documentation on the trait for more +detailed information on how to implement. 
+ +Given some implementation of [`BBIDataSource`] (like [`BedParserStreamingIterator`][crate::bbi::beddata::BedParserStreamingIterator]), a bigWig can be created using [`BigWigWrite::write`] or a bigBed with [`BigBedWrite::write`]. Both take a map of chromosome sizes, the aforementioned data, and a `Runtime` to spawn processing on. diff --git a/bigtools/src/utils/cli/bedgraphtobigwig.rs b/bigtools/src/utils/cli/bedgraphtobigwig.rs index 0eef7a3..eb7d54b 100644 --- a/bigtools/src/utils/cli/bedgraphtobigwig.rs +++ b/bigtools/src/utils/cli/bedgraphtobigwig.rs @@ -6,7 +6,7 @@ use tokio::runtime; use crate::bed::bedparser::parse_bedgraph; use crate::bed::indexer::index_chroms; -use crate::bedchromdata::{BedParserParallelStreamingIterator, BedParserStreamingIterator}; +use crate::beddata::{BedParserParallelStreamingIterator, BedParserStreamingIterator}; use crate::{BigWigWrite, InputSortType}; use super::BBIWriteArgs; @@ -133,24 +133,24 @@ pub fn bedgraphtobigwig(args: BedGraphToBigWigArgs) -> Result<(), Box }; if let Some(chrom_indices) = chrom_indices { if args.single_pass { - let chsi = BedParserParallelStreamingIterator::new( + let data = BedParserParallelStreamingIterator::new( chrom_indices, allow_out_of_order_chroms, PathBuf::from(bedgraphpath), parse_bedgraph, ); - outb.write(chrom_map, chsi, runtime)?; + outb.write(chrom_map, data, runtime)?; } else { outb.write_multipass( || { - let chsi = BedParserParallelStreamingIterator::new( + let data = BedParserParallelStreamingIterator::new( chrom_indices.clone(), allow_out_of_order_chroms, PathBuf::from(bedgraphpath.clone()), parse_bedgraph, ); - Ok(chsi) + Ok(data) }, chrom_map, runtime, diff --git a/bigtools/src/utils/cli/bedtobigbed.rs b/bigtools/src/utils/cli/bedtobigbed.rs index c238047..45723aa 100644 --- a/bigtools/src/utils/cli/bedtobigbed.rs +++ b/bigtools/src/utils/cli/bedtobigbed.rs @@ -9,8 +9,8 @@ use tokio::runtime; use crate::bed::bedparser::{parse_bed, BedFileStream, StreamingBedValues}; use 
crate::bed::indexer::index_chroms; -use crate::bedchromdata::BedParserParallelStreamingIterator; -use crate::{bedchromdata::BedParserStreamingIterator, BigBedWrite, InputSortType}; +use crate::beddata::BedParserParallelStreamingIterator; +use crate::{beddata::BedParserStreamingIterator, BigBedWrite, InputSortType}; use super::BBIWriteArgs; @@ -116,8 +116,8 @@ pub fn bedtobigbed(args: BedToBigBedArgs) -> Result<(), Box> { let allow_out_of_order_chroms = !matches!(outb.options.input_sort_type, InputSortType::ALL); if bedpath == "-" || bedpath == "stdin" { let stdin = std::io::stdin().lock(); - let chsi = BedParserStreamingIterator::from_bed_file(stdin, allow_out_of_order_chroms); - outb.write(chrom_map, chsi, runtime)?; + let data = BedParserStreamingIterator::from_bed_file(stdin, allow_out_of_order_chroms); + outb.write(chrom_map, data, runtime)?; } else { let infile = File::open(&bedpath)?; let (parallel, parallel_required) = match (nthreads, args.parallel.as_ref()) { @@ -150,24 +150,24 @@ pub fn bedtobigbed(args: BedToBigBedArgs) -> Result<(), Box> { }; if let Some(chrom_indices) = chrom_indices { if args.single_pass { - let chsi = BedParserParallelStreamingIterator::new( + let data = BedParserParallelStreamingIterator::new( chrom_indices, allow_out_of_order_chroms, PathBuf::from(bedpath), parse_bed, ); - outb.write(chrom_map, chsi, runtime)?; + outb.write(chrom_map, data, runtime)?; } else { outb.write_multipass( || { - let chsi = BedParserParallelStreamingIterator::new( + let data = BedParserParallelStreamingIterator::new( chrom_indices.clone(), allow_out_of_order_chroms, PathBuf::from(bedpath.clone()), parse_bed, ); - Ok(chsi) + Ok(data) }, chrom_map, runtime, @@ -176,19 +176,19 @@ pub fn bedtobigbed(args: BedToBigBedArgs) -> Result<(), Box> { } else { let infile = File::open(&bedpath)?; if args.single_pass { - let chsi = + let data = BedParserStreamingIterator::from_bed_file(infile, allow_out_of_order_chroms); - outb.write(chrom_map, chsi, runtime)?; + 
outb.write(chrom_map, data, runtime)?; } else { outb.write_multipass( || { let infile = File::open(&bedpath)?; - let chsi = BedParserStreamingIterator::from_bed_file( + let data = BedParserStreamingIterator::from_bed_file( infile, allow_out_of_order_chroms, ); - Ok(chsi) + Ok(data) }, chrom_map, runtime, diff --git a/bigtools/src/utils/cli/bigwigmerge.rs b/bigtools/src/utils/cli/bigwigmerge.rs index 91c97d4..d1f894e 100644 --- a/bigtools/src/utils/cli/bigwigmerge.rs +++ b/bigtools/src/utils/cli/bigwigmerge.rs @@ -9,8 +9,8 @@ use thiserror::Error; use crate::utils::merge::merge_sections_many; use crate::utils::reopen::ReopenableFile; -use crate::{BBIReadError, BigWigRead, BigWigWrite, ChromProcess}; -use crate::{ChromData, ProcessChromError, Value}; +use crate::{BBIDataProcessor, BBIReadError, BigWigRead, BigWigWrite}; +use crate::{BBIDataSource, ProcessChromError, Value}; use tokio::runtime::{self, Runtime}; use super::BBIWriteArgs; @@ -345,12 +345,12 @@ struct ChromGroupReadImpl { iter: Box> + Send>, } -impl ChromData for ChromGroupReadImpl { +impl BBIDataSource for ChromGroupReadImpl { type Value = Value; type Error = MergingValuesError; fn process_to_bbi< - P: ChromProcess, + P: BBIDataProcessor, StartProcessing: FnMut(String) -> Result>, Advance: FnMut(P) -> Result<(), ProcessChromError>, >( diff --git a/bigtools/tests/bigbedwrite.rs b/bigtools/tests/bigbedwrite.rs index f2e89b4..7c3f18f 100644 --- a/bigtools/tests/bigbedwrite.rs +++ b/bigtools/tests/bigbedwrite.rs @@ -1,7 +1,7 @@ use std::error::Error; use bigtools::bed::bedparser::{BedFileStream, StreamingBedValues}; -use bigtools::bedchromdata::BedParserStreamingIterator; +use bigtools::beddata::BedParserStreamingIterator; use tokio::runtime; #[test] @@ -48,8 +48,8 @@ fn bigbedwrite_test() -> Result<(), Box> { chrom_map.insert("chr19".to_string(), 58617616); let infile = File::open(bed)?; - let chsi = BedParserStreamingIterator::from_bed_file(infile, false); - outb.write(chrom_map, chsi, runtime).unwrap(); + 
let data = BedParserStreamingIterator::from_bed_file(infile, false); + outb.write(chrom_map, data, runtime).unwrap(); let mut bwread = BigBedRead::open_file(&tempfile.path().to_string_lossy()).unwrap(); diff --git a/bigtools/tests/bigwigwrite.rs b/bigtools/tests/bigwigwrite.rs index 3a7deeb..756cbe4 100644 --- a/bigtools/tests/bigwigwrite.rs +++ b/bigtools/tests/bigwigwrite.rs @@ -7,7 +7,7 @@ use std::path::PathBuf; use tempfile; use bigtools::bed::bedparser::{BedFileStream, StreamingBedValues}; -use bigtools::bedchromdata::BedParserStreamingIterator; +use bigtools::beddata::BedParserStreamingIterator; use bigtools::{BigWigRead, BigWigWrite, Value}; use tokio::runtime; @@ -37,8 +37,8 @@ fn test() -> Result<(), Box> { let mut chrom_map = HashMap::new(); chrom_map.insert("chr17".to_string(), 83257441); - let chsi = BedParserStreamingIterator::from_bedgraph_file(infile, false); - outb.write(chrom_map, chsi, runtime).unwrap(); + let data = BedParserStreamingIterator::from_bedgraph_file(infile, false); + outb.write(chrom_map, data, runtime).unwrap(); let mut bwread = BigWigRead::open_file(&tempfile.path().to_string_lossy()).unwrap(); @@ -85,8 +85,8 @@ fn test_multi_pass() -> Result<(), Box> { outb.write_multipass( || { let infile = File::open(single_chrom_bedgraph.clone())?; - let chsi = BedParserStreamingIterator::from_bedgraph_file(infile, false); - Ok(chsi) + let data = BedParserStreamingIterator::from_bedgraph_file(infile, false); + Ok(data) }, chrom_map, runtime, @@ -134,8 +134,8 @@ fn test_multi_chrom() -> io::Result<()> { chrom_map.insert("chr5".to_string(), 181538259); chrom_map.insert("chr6".to_string(), 170805979); - let chsi = BedParserStreamingIterator::from_bedgraph_file(infile, false); - outb.write(chrom_map, chsi, runtime).unwrap(); + let data = BedParserStreamingIterator::from_bedgraph_file(infile, false); + outb.write(chrom_map, data, runtime).unwrap(); let mut bwread = BigWigRead::open_file(&tempfile.path().to_string_lossy()).unwrap(); diff --git 
a/pybigtools/src/lib.rs b/pybigtools/src/lib.rs index 51bc1c4..daba1c3 100644 --- a/pybigtools/src/lib.rs +++ b/pybigtools/src/lib.rs @@ -7,7 +7,7 @@ use std::ops::IndexMut; use std::path::Path; use bigtools::bed::autosql::parse::parse_autosql; -use bigtools::bedchromdata::BedParserStreamingIterator; +use bigtools::beddata::BedParserStreamingIterator; #[cfg(feature = "remote")] use bigtools::utils::file::remote_file::RemoteFile; use bigtools::utils::file::reopen::ReopenableFile; @@ -2026,8 +2026,8 @@ impl BigWigWrite { Err(e) => Err(io::Error::new(io::ErrorKind::Other, format!("{}", e.0))), Ok(v) => Ok(v), }); - let chsi = BedParserStreamingIterator::wrap_iter(vals_iter_raw, true); - match bigwig.write(chrom_map, chsi, runtime) { + let data = BedParserStreamingIterator::wrap_iter(vals_iter_raw, true); + match bigwig.write(chrom_map, data, runtime) { Err(e) => println!("{}", e), Ok(_) => {} } @@ -2178,8 +2178,8 @@ impl BigBedWrite { Err(e) => Err(io::Error::new(io::ErrorKind::Other, format!("{}", e.0))), Ok(v) => Ok(v), }); - let chsi = BedParserStreamingIterator::wrap_iter(vals_iter_raw, true); - match bigbed.write(chrom_map, chsi, runtime) { + let data = BedParserStreamingIterator::wrap_iter(vals_iter_raw, true); + match bigbed.write(chrom_map, data, runtime) { Err(e) => { println!("{}", e) } From 62669315bcbc4d09f73f4bb1a9396574edde7c02 Mon Sep 17 00:00:00 2001 From: Jack Huey <31162821+jackh726@users.noreply.github.com> Date: Thu, 4 Jul 2024 01:23:00 -0400 Subject: [PATCH 28/31] Rename ProcessChromError to BBIProcessError and make do_process return new error that doesn't include SourceError --- bigtools/src/bbi/bbiwrite.rs | 73 +++++++++++++++++---------- bigtools/src/bbi/beddata.rs | 40 +++++++-------- bigtools/src/bbi/bigbedwrite.rs | 43 ++++++++-------- bigtools/src/bbi/bigwigwrite.rs | 31 ++++++------ bigtools/src/utils/cli/bigwigmerge.rs | 12 ++--- 5 files changed, 111 insertions(+), 88 deletions(-) diff --git a/bigtools/src/bbi/bbiwrite.rs 
b/bigtools/src/bbi/bbiwrite.rs index 60e34e2..32c76c9 100644 --- a/bigtools/src/bbi/bbiwrite.rs +++ b/bigtools/src/bbi/bbiwrite.rs @@ -107,7 +107,7 @@ impl Default for BBIWriteOptions { /// Possible errors encountered when processing a chromosome when writing a bbi file #[derive(Error, Debug)] -pub enum ProcessChromError { +pub enum BBIProcessError { #[error("{}", .0)] InvalidInput(String), #[error("{}", .0)] @@ -118,10 +118,20 @@ pub enum ProcessChromError { SourceError(SourceError), } +impl From for BBIProcessError { + fn from(value: ProcessDataError) -> Self { + match value { + ProcessDataError::InvalidInput(e) => BBIProcessError::InvalidInput(e), + ProcessDataError::InvalidChromosome(e) => BBIProcessError::InvalidChromosome(e), + ProcessDataError::IoError(e) => BBIProcessError::IoError(e), + } + } +} + pub(crate) struct TempZoomInfo { pub resolution: u32, pub data_write_future: - tokio::task::JoinHandle>>, + tokio::task::JoinHandle>>, pub data: TempFileBuffer>, pub sections: crossbeam_channel::Receiver
, } @@ -156,7 +166,7 @@ pub(crate) fn write_info( zoom_entries: Vec, summary: Summary, data_count: u64, -) -> Result<(), ProcessChromError> { +) -> Result<(), BBIProcessError> { file.seek(SeekFrom::Start(0))?; file.write_u32::(magic)?; file.write_u16::(4)?; @@ -556,14 +566,14 @@ pub trait BBIDataSource: Sized { fn process_to_bbi< P: BBIDataProcessor + Send + 'static, - StartProcessing: FnMut(String) -> Result>, - Advance: FnMut(P) -> Result<(), ProcessChromError>, + StartProcessing: FnMut(String) -> Result>, + Advance: FnMut(P) -> Result<(), BBIProcessError>, >( &mut self, runtime: &Runtime, start_processing: &mut StartProcessing, advance: &mut Advance, - ) -> Result<(), ProcessChromError>; + ) -> Result<(), BBIProcessError>; } // Zooms have to be double-buffered: first because chroms could be processed in parallel and second because we don't know the offset of each zoom immediately @@ -575,13 +585,13 @@ pub(crate) type ZoomValue = ( type Data = ( crossbeam_channel::Receiver
, TempFileBuffer>, - tokio::task::JoinHandle>>, + tokio::task::JoinHandle>>, Vec>, ); type DataWithoutzooms = ( crossbeam_channel::Receiver
, TempFileBuffer>, - tokio::task::JoinHandle>>, + tokio::task::JoinHandle>>, ); async fn write_chroms_with_zooms( @@ -595,7 +605,7 @@ async fn write_chroms_with_zooms( Vec>, BTreeMap, ), - ProcessChromError, + BBIProcessError, > { let mut section_iter = vec![]; let mut max_uncompressed_buf_size = 0; @@ -652,7 +662,7 @@ async fn write_chroms_without_zooms( usize, Vec>, ), - ProcessChromError, + BBIProcessError, > { let mut section_iter = vec![]; let mut max_uncompressed_buf_size = 0; @@ -694,13 +704,24 @@ pub(crate) mod process_internal { } } +/// Possible errors encountered when processing a value to a BBI file. +#[derive(Error, Debug)] +pub enum ProcessDataError { + #[error("{}", .0)] + InvalidInput(String), + #[error("{}", .0)] + InvalidChromosome(String), + #[error("{}", .0)] + IoError(#[from] io::Error), +} + pub trait BBIDataProcessor: process_internal::BBIDataProcessorCreate { type Value: Send + 'static; - fn do_process( + fn do_process( &mut self, current_val: Self::Value, next_val: Option<&Self::Value>, - ) -> impl Future>> + Send; + ) -> impl Future> + Send; } pub(crate) fn write_vals< @@ -726,7 +747,7 @@ pub(crate) fn write_vals< Vec, usize, ), - ProcessChromError, + BBIProcessError, > { let zooms_map: BTreeMap = std::iter::successors(Some(options.initial_zoom_size), |z| Some(z * 4)) @@ -751,7 +772,7 @@ pub(crate) fn write_vals< send: &mut futures_mpsc::UnboundedSender<( crossbeam_channel::Receiver
, TempFileBuffer>, - tokio::task::JoinHandle>>, + tokio::task::JoinHandle>>, Vec>, )>, options: BBIWriteOptions, @@ -790,11 +811,11 @@ pub(crate) fn write_vals< (zooms_channels, ftx) } - let mut do_read = |chrom: String| -> Result<_, ProcessChromError<_>> { + let mut do_read = |chrom: String| -> Result<_, BBIProcessError<_>> { let length = match chrom_sizes.get(&chrom) { Some(length) => *length, None => { - return Err(ProcessChromError::InvalidChromosome(format!( + return Err(BBIProcessError::InvalidChromosome(format!( "Input bedGraph contains chromosome that isn't in the input chrom sizes: {}", chrom ))); @@ -906,7 +927,7 @@ pub(crate) fn write_vals_no_zoom< Flatten>>, usize, ), - ProcessChromError, + BBIProcessError, > { let total_zoom_counts = std::iter::successors(Some(10), |z: &u64| Some((*z).saturating_mul(4))) .take_while(|z| *z < u64::MAX) @@ -929,11 +950,11 @@ pub(crate) fn write_vals_no_zoom< ftx }; - let mut do_read = |chrom: String| -> Result<_, ProcessChromError<_>> { + let mut do_read = |chrom: String| -> Result<_, BBIProcessError<_>> { let length = match chrom_sizes.get(&chrom) { Some(length) => *length, None => { - return Err(ProcessChromError::InvalidChromosome(format!( + return Err(BBIProcessError::InvalidChromosome(format!( "Input bedGraph contains chromosome that isn't in the input chrom sizes: {}", chrom ))); @@ -1017,7 +1038,7 @@ pub(crate) struct InternalTempZoomInfo { pub resolution: u32, pub data_write_future: - tokio::task::JoinHandle>>, + tokio::task::JoinHandle>>, pub data: TempFileBuffer>>, pub sections: crossbeam_channel::Receiver
, } @@ -1048,11 +1069,11 @@ pub(crate) fn write_zoom_vals< zoom_counts: BTreeMap, mut file: BufWriter, data_size: u64, -) -> Result<(BufWriter, Vec, usize), ProcessChromError> { +) -> Result<(BufWriter, Vec, usize), BBIProcessError> { let min_first_zoom_size = average_size.max(10) * 4; let mut zoom_receivers = vec![]; let mut zoom_files = vec![]; - let mut zooms_map: BTreeMap>> = zoom_counts + let mut zooms_map: BTreeMap>> = zoom_counts .into_iter() .skip_while(|z| z.0 > min_first_zoom_size as u64) .skip_while(|z| { @@ -1082,7 +1103,7 @@ pub(crate) fn write_zoom_vals< let mut max_uncompressed_buf_size = 0; - let mut do_read = |chrom: String| -> Result> { + let mut do_read = |chrom: String| -> Result> { // Make a new id for the chromosome let chrom_id = *chrom_ids .get(&chrom) @@ -1239,7 +1260,7 @@ pub(crate) fn write_mid( chrom_sizes: HashMap, chrom_ids: &HashMap, options: BBIWriteOptions, -) -> Result<(u64, u64, u64, u64), ProcessChromError> { +) -> Result<(u64, u64, u64, u64), BBIProcessError> { let data_size = file.tell()? - pre_data; let mut current_offset = pre_data; let sections_iter = raw_sections_iter.map(|mut section| { @@ -1276,7 +1297,7 @@ async fn write_data( mut data_file: W, section_sender: crossbeam_channel::Sender
, mut frx: futures_mpsc::Receiver>>, -) -> Result<(usize, usize), ProcessChromError> { +) -> Result<(usize, usize), BBIProcessError> { let mut current_offset = 0; let mut total = 0; let mut max_uncompressed_buf_size = 0; @@ -1306,7 +1327,7 @@ pub(crate) fn future_channel ( BBIDataProcessoringInputSectionChannel, - tokio::task::JoinHandle>>, + tokio::task::JoinHandle>>, TempFileBuffer, crossbeam_channel::Receiver
, ) { diff --git a/bigtools/src/bbi/beddata.rs b/bigtools/src/bbi/beddata.rs index d7ed779..2c9a66b 100644 --- a/bigtools/src/bbi/beddata.rs +++ b/bigtools/src/bbi/beddata.rs @@ -19,7 +19,7 @@ use crate::bed::bedparser::{ }; use crate::utils::file_view::FileView; use crate::utils::streaming_linereader::StreamingLineReader; -use crate::{BBIDataProcessor, BBIDataSource, BedEntry, ProcessChromError, Value}; +use crate::{BBIDataProcessor, BBIDataSource, BBIProcessError, BedEntry, Value}; pub struct BedParserStreamingIterator { bed_data: S, @@ -91,19 +91,19 @@ impl BBIDataSource for BedParserStreamingIterator { fn process_to_bbi< P: BBIDataProcessor, - StartProcessing: FnMut(String) -> Result>, - Advance: FnMut(P) -> Result<(), ProcessChromError>, + StartProcessing: FnMut(String) -> Result>, + Advance: FnMut(P) -> Result<(), BBIProcessError>, >( &mut self, runtime: &Runtime, start_processing: &mut StartProcessing, advance: &mut Advance, - ) -> Result<(), ProcessChromError> { + ) -> Result<(), BBIProcessError> { runtime.block_on(async move { let first_val = self.bed_data.next(); let (mut curr_state, mut next_val) = match first_val { // The first value is an error - Some(Err(e)) => return Err(ProcessChromError::SourceError(e)), + Some(Err(e)) => return Err(BBIProcessError::SourceError(e)), // There are no values at all None => return Ok(()), // The next value is the first @@ -112,7 +112,7 @@ impl BBIDataSource for BedParserStreamingIterator { let mut p = start_processing(chrom.clone())?; let next_val = self.bed_data.next(); let next_val = match next_val { - Some(Err(e)) => return Err(ProcessChromError::SourceError(e)), + Some(Err(e)) => return Err(BBIProcessError::SourceError(e)), Some(Ok(v)) => Some(v), None => None, }; @@ -135,7 +135,7 @@ impl BBIDataSource for BedParserStreamingIterator { ((curr_chrom, curr_state), Some((chrom, val))) if chrom == curr_chrom => { let next_val = self.bed_data.next(); let next_val = match next_val { - Some(Err(e)) => return 
Err(ProcessChromError::SourceError(e)), + Some(Err(e)) => return Err(BBIProcessError::SourceError(e)), Some(Ok(v)) => Some(v), None => None, }; @@ -151,7 +151,7 @@ impl BBIDataSource for BedParserStreamingIterator { let (prev_chrom, prev_state) = curr_state; // TODO: test this correctly fails if !self.allow_out_of_order_chroms && prev_chrom.as_str() >= chrom { - return Err(ProcessChromError::SourceError(BedValueError::InvalidInput("Input bedGraph not sorted by chromosome. Sort with `sort -k1,1 -k2,2n`.".to_string()))); + return Err(BBIProcessError::SourceError(BedValueError::InvalidInput("Input bedGraph not sorted by chromosome. Sort with `sort -k1,1 -k2,2n`.".to_string()))); } advance(prev_state)?; @@ -159,7 +159,7 @@ impl BBIDataSource for BedParserStreamingIterator { let mut p = start_processing(chrom.clone())?; let next_val = self.bed_data.next(); let next_val = match next_val { - Some(Err(e)) => return Err(ProcessChromError::SourceError(e)), + Some(Err(e)) => return Err(BBIProcessError::SourceError(e)), Some(Ok(v)) => Some(v), None => None, }; @@ -213,14 +213,14 @@ impl BBIDataSource for BedParserParallelStreamingIterator fn process_to_bbi< P: BBIDataProcessor + Send + 'static, - StartProcessing: FnMut(String) -> Result>, - Advance: FnMut(P) -> Result<(), ProcessChromError>, + StartProcessing: FnMut(String) -> Result>, + Advance: FnMut(P) -> Result<(), BBIProcessError>, >( &mut self, runtime: &Runtime, start_processing: &mut StartProcessing, advance: &mut Advance, - ) -> Result<(), ProcessChromError> { + ) -> Result<(), BBIProcessError> { let mut remaining = true; let mut queued_reads: VecDeque<_> = VecDeque::new(); loop { @@ -235,7 +235,7 @@ impl BBIDataSource for BedParserParallelStreamingIterator next.map(|n| assert!(curr.1 != n.1)); // TODO: test this correctly fails if !self.allow_out_of_order_chroms && next.map(|n| curr.1 > n.1).unwrap_or(false) { - return Err(ProcessChromError::SourceError(BedValueError::InvalidInput( + return 
Err(BBIProcessError::SourceError(BedValueError::InvalidInput( "Input bedGraph not sorted by chromosome. Sort with `sort -k1,1 -k2,2n`." .to_string(), ))); @@ -243,7 +243,7 @@ impl BBIDataSource for BedParserParallelStreamingIterator let file = match File::open(&self.path) { Ok(f) => f, - Err(err) => return Err(ProcessChromError::SourceError(err.into())), + Err(err) => return Err(BBIProcessError::SourceError(err.into())), }; let file = FileView::new(file, curr.0, next.map(|n| n.0).unwrap_or(u64::MAX))?; let mut stream = BedFileStream { @@ -253,7 +253,7 @@ impl BBIDataSource for BedParserParallelStreamingIterator let mut p = start_processing(curr.1.clone())?; let curr_chrom = curr.1.clone(); - let data: tokio::task::JoinHandle>> = + let data: tokio::task::JoinHandle>> = runtime.spawn(async move { let mut next_val: Option> = None; @@ -264,10 +264,10 @@ impl BBIDataSource for BedParserParallelStreamingIterator }; next_val = match curr_value { // The next value is an error - Some(Err(e)) => return Err(ProcessChromError::SourceError(e)), + Some(Err(e)) => return Err(BBIProcessError::SourceError(e)), None => return Ok(p), Some(Ok((chrom, _))) if chrom != curr_chrom => { - return Err(ProcessChromError::InvalidInput( + return Err(BBIProcessError::InvalidInput( "File is not sorted.".to_string(), )); } @@ -301,7 +301,7 @@ mod tests { use super::*; use crate::bed::bedparser::parse_bedgraph; use crate::process_internal::BBIDataProcessorCreate; - use crate::{ProcessChromError, Value}; + use crate::{ProcessDataError, Value}; use std::fs::File; use std::io; use std::path::PathBuf; @@ -336,11 +336,11 @@ mod tests { } impl BBIDataProcessor for TestBBIDataProcessor { type Value = Value; - async fn do_process( + async fn do_process( &mut self, _current_val: Self::Value, _next_val: Option<&Self::Value>, - ) -> Result<(), ProcessChromError> { + ) -> Result<(), ProcessDataError> { self.count += 1; Ok(()) diff --git a/bigtools/src/bbi/bigbedwrite.rs b/bigtools/src/bbi/bigbedwrite.rs index 
c3d6951..9661614 100644 --- a/bigtools/src/bbi/bigbedwrite.rs +++ b/bigtools/src/bbi/bigbedwrite.rs @@ -15,13 +15,14 @@ use crate::utils::tell::Tell; use crate::{ write_info, BBIDataProcessor, BBIDataProcessoredData, BBIDataProcessoringInputSectionChannel, BBIDataSource, InternalProcessData, InternalTempZoomInfo, NoZoomsInternalProcessData, - NoZoomsInternalProcessedData, ZoomsInternalProcessData, ZoomsInternalProcessedData, + NoZoomsInternalProcessedData, ProcessDataError, ZoomsInternalProcessData, + ZoomsInternalProcessedData, }; use crate::bbi::{BedEntry, Summary, Value, ZoomRecord, BIGBED_MAGIC}; use crate::bbiwrite::{ - self, encode_zoom_section, write_blank_headers, write_zooms, BBIWriteOptions, - ProcessChromError, SectionData, + self, encode_zoom_section, write_blank_headers, write_zooms, BBIProcessError, BBIWriteOptions, + SectionData, }; /// The struct used to write a bigBed file @@ -43,7 +44,7 @@ impl BigBedWrite { fn write_pre( file: &mut BufWriter, autosql: &Option, - ) -> Result<(u64, u64, u64, u64), ProcessChromError> { + ) -> Result<(u64, u64, u64, u64), BBIProcessError> { write_blank_headers(file)?; let autosql_offset = file.tell()?; @@ -51,7 +52,7 @@ impl BigBedWrite { .clone() .unwrap_or_else(|| crate::bed::autosql::BED3.to_string()); let autosql = CString::new(autosql.into_bytes()).map_err(|_| { - ProcessChromError::InvalidInput("Invalid autosql: null byte in string".to_owned()) + BBIProcessError::InvalidInput("Invalid autosql: null byte in string".to_owned()) })?; file.write_all(autosql.as_bytes_with_nul())?; @@ -85,7 +86,7 @@ impl BigBedWrite { chrom_sizes: HashMap, vals: V, runtime: Runtime, - ) -> Result<(), ProcessChromError> { + ) -> Result<(), BBIProcessError> { let fp = File::create(self.path.clone())?; let mut file = BufWriter::new(fp); @@ -142,10 +143,10 @@ impl BigBedWrite { /// high resolution zooms takes up a substantial portion of total processing time. 
pub fn write_multipass>( self, - make_vals: impl Fn() -> Result>, + make_vals: impl Fn() -> Result>, chrom_sizes: HashMap, runtime: Runtime, - ) -> Result<(), ProcessChromError> { + ) -> Result<(), BBIProcessError> { let fp = File::create(self.path.clone())?; let mut file = BufWriter::new(fp); @@ -211,7 +212,7 @@ impl BigBedWrite { Ok(()) } - async fn process_val( + async fn process_val( current_val: BedEntry, next_val: Option<&BedEntry>, chrom_length: u32, @@ -223,20 +224,20 @@ impl BigBedWrite { runtime: &Handle, ftx: &mut BBIDataProcessoringInputSectionChannel, chrom_id: u32, - ) -> Result<(), ProcessChromError> { + ) -> Result<(), ProcessDataError> { // Check a few preconditions: // - The current end is greater than or equal to the start // - The current end is at most the chromosome length // - If there is a next value, then it does not overlap value // TODO: test these correctly fails if current_val.start > current_val.end { - return Err(ProcessChromError::InvalidInput(format!( + return Err(ProcessDataError::InvalidInput(format!( "Invalid bed: {} > {}", current_val.start, current_val.end ))); } if current_val.start >= chrom_length { - return Err(ProcessChromError::InvalidInput(format!( + return Err(ProcessDataError::InvalidInput(format!( "Invalid bed: `{}` is greater than the chromosome ({}) length ({})", current_val.start, chrom, chrom_length ))); @@ -245,7 +246,7 @@ impl BigBedWrite { None => (), Some(next_val) => { if current_val.start > next_val.start { - return Err(ProcessChromError::InvalidInput(format!( + return Err(ProcessDataError::InvalidInput(format!( "Invalid bed: not sorted on chromosome {} at {}-{} (first) and {}-{} (second). 
Use sort -k1,1 -k2,2n to sort the bed before input.", chrom, current_val.start, @@ -370,7 +371,7 @@ impl BigBedWrite { Ok(()) } - async fn process_val_zoom( + async fn process_val_zoom( zoom_items: &mut Vec, options: BBIWriteOptions, item_start: u32, @@ -378,7 +379,7 @@ impl BigBedWrite { next_val: Option<&BedEntry>, runtime: &Handle, chrom_id: u32, - ) -> Result<(), ProcessChromError> { + ) -> Result<(), ProcessDataError> { // Then, add the item to the zoom item queues. This is a bit complicated. for zoom_item in zoom_items.iter_mut() { debug_assert_ne!(zoom_item.records.len(), options.items_per_slot as usize); @@ -614,11 +615,11 @@ impl BBIDataProcessorCreate for BigBedFullProcess { } impl BBIDataProcessor for BigBedFullProcess { type Value = BedEntry; - async fn do_process( + async fn do_process( &mut self, current_val: Self::Value, next_val: Option<&Self::Value>, - ) -> Result<(), ProcessChromError> { + ) -> Result<(), ProcessDataError> { let Self { summary, total_items, @@ -754,11 +755,11 @@ impl BBIDataProcessorCreate for BigBedNoZoomsProcess { impl BBIDataProcessor for BigBedNoZoomsProcess { type Value = BedEntry; - async fn do_process( + async fn do_process( &mut self, current_val: Self::Value, next_val: Option<&Self::Value>, - ) -> Result<(), ProcessChromError> { + ) -> Result<(), ProcessDataError> { let BigBedNoZoomsProcess { ftx, chrom_id, @@ -856,11 +857,11 @@ impl BBIDataProcessorCreate for BigBedZoomsProcess { } impl BBIDataProcessor for BigBedZoomsProcess { type Value = BedEntry; - async fn do_process( + async fn do_process( &mut self, current_val: Self::Value, next_val: Option<&Self::Value>, - ) -> Result<(), ProcessChromError> { + ) -> Result<(), ProcessDataError> { let BigBedZoomsProcess { chrom_id, options, diff --git a/bigtools/src/bbi/bigwigwrite.rs b/bigtools/src/bbi/bigwigwrite.rs index 8cb2063..06d056b 100644 --- a/bigtools/src/bbi/bigwigwrite.rs +++ b/bigtools/src/bbi/bigwigwrite.rs @@ -54,13 +54,14 @@ use crate::utils::tell::Tell; use 
crate::{ write_info, BBIDataProcessor, BBIDataProcessoredData, BBIDataProcessoringInputSectionChannel, BBIDataSource, InternalProcessData, InternalTempZoomInfo, NoZoomsInternalProcessData, - NoZoomsInternalProcessedData, ZoomsInternalProcessData, ZoomsInternalProcessedData, + NoZoomsInternalProcessedData, ProcessDataError, ZoomsInternalProcessData, + ZoomsInternalProcessedData, }; use crate::bbi::{Summary, Value, ZoomRecord, BIGWIG_MAGIC}; use crate::bbiwrite::{ - self, encode_zoom_section, write_blank_headers, write_zooms, BBIWriteOptions, - ProcessChromError, SectionData, + self, encode_zoom_section, write_blank_headers, write_zooms, BBIProcessError, BBIWriteOptions, + SectionData, }; struct ZoomItem { @@ -89,7 +90,7 @@ impl BigWigWrite { fn write_pre( file: &mut BufWriter, - ) -> Result<(u64, u64, u64), ProcessChromError> { + ) -> Result<(u64, u64, u64), BBIProcessError> { write_blank_headers(file)?; let total_summary_offset = file.tell()?; @@ -115,7 +116,7 @@ impl BigWigWrite { chrom_sizes: HashMap, vals: V, runtime: Runtime, - ) -> Result<(), ProcessChromError> { + ) -> Result<(), BBIProcessError> { let options = self.options; let fp = File::create(self.path.clone())?; let mut file = BufWriter::new(fp); @@ -177,10 +178,10 @@ impl BigWigWrite { /// high resolution zooms takes up a substantial portion of total processing time. 
pub fn write_multipass>( self, - make_vals: impl Fn() -> Result>, + make_vals: impl Fn() -> Result>, chrom_sizes: HashMap, runtime: Runtime, - ) -> Result<(), ProcessChromError> { + ) -> Result<(), BBIProcessError> { let fp = File::create(self.path.clone())?; let mut file = BufWriter::new(fp); @@ -393,9 +394,9 @@ impl BigWigWrite { struct BigWigInvalidInput(String); -impl From for ProcessChromError { +impl From for ProcessDataError { fn from(value: BigWigInvalidInput) -> Self { - ProcessChromError::InvalidInput(value.0) + ProcessDataError::InvalidInput(value.0) } } @@ -475,11 +476,11 @@ impl BBIDataProcessorCreate for BigWigFullProcess { impl BBIDataProcessor for BigWigFullProcess { type Value = Value; - async fn do_process( + async fn do_process( &mut self, current_val: Value, next_val: Option<&Value>, - ) -> Result<(), ProcessChromError> { + ) -> Result<(), ProcessDataError> { let Self { summary, items, @@ -606,11 +607,11 @@ impl BBIDataProcessorCreate for BigWigNoZoomsProcess { impl BBIDataProcessor for BigWigNoZoomsProcess { type Value = Value; - async fn do_process( + async fn do_process( &mut self, current_val: Self::Value, next_val: Option<&Self::Value>, - ) -> Result<(), ProcessChromError> { + ) -> Result<(), ProcessDataError> { let BigWigNoZoomsProcess { ftx, chrom_id, @@ -699,11 +700,11 @@ impl BBIDataProcessorCreate for BigWigZoomsProcess { } impl BBIDataProcessor for BigWigZoomsProcess { type Value = Value; - async fn do_process( + async fn do_process( &mut self, current_val: Self::Value, next_val: Option<&Self::Value>, - ) -> Result<(), ProcessChromError> { + ) -> Result<(), ProcessDataError> { let BigWigZoomsProcess { chrom_id, options, diff --git a/bigtools/src/utils/cli/bigwigmerge.rs b/bigtools/src/utils/cli/bigwigmerge.rs index d1f894e..b453be3 100644 --- a/bigtools/src/utils/cli/bigwigmerge.rs +++ b/bigtools/src/utils/cli/bigwigmerge.rs @@ -10,7 +10,7 @@ use thiserror::Error; use crate::utils::merge::merge_sections_many; use 
crate::utils::reopen::ReopenableFile; use crate::{BBIDataProcessor, BBIReadError, BigWigRead, BigWigWrite}; -use crate::{BBIDataSource, ProcessChromError, Value}; +use crate::{BBIDataSource, BBIProcessError, Value}; use tokio::runtime::{self, Runtime}; use super::BBIWriteArgs; @@ -351,14 +351,14 @@ impl BBIDataSource for ChromGroupReadImpl { fn process_to_bbi< P: BBIDataProcessor, - StartProcessing: FnMut(String) -> Result>, - Advance: FnMut(P) -> Result<(), ProcessChromError>, + StartProcessing: FnMut(String) -> Result>, + Advance: FnMut(P) -> Result<(), BBIProcessError>, >( &mut self, runtime: &Runtime, start_processing: &mut StartProcessing, advance: &mut Advance, - ) -> Result<(), ProcessChromError> { + ) -> Result<(), BBIProcessError> { loop { let next: Option> = self.iter.next(); @@ -369,7 +369,7 @@ impl BBIDataSource for ChromGroupReadImpl { loop { let current_val = match group.iter.next() { Some(Ok(v)) => v, - Some(Err(e)) => Err(ProcessChromError::SourceError(e))?, + Some(Err(e)) => Err(BBIProcessError::SourceError(e))?, None => break, }; let next_val = match group.iter.peek() { @@ -382,7 +382,7 @@ impl BBIDataSource for ChromGroupReadImpl { advance(p)?; } - Some(Err(e)) => return Err(ProcessChromError::SourceError(e)), + Some(Err(e)) => return Err(BBIProcessError::SourceError(e)), None => break, } } From d8b18783d3db1b388877ba749a098bb87bf391c0 Mon Sep 17 00:00:00 2001 From: Jack Huey <31162821+jackh726@users.noreply.github.com> Date: Thu, 4 Jul 2024 01:49:52 -0400 Subject: [PATCH 29/31] Use ProcessDataError instead of BBIProcessError basically everywhere --- bigtools/src/bbi/bbiwrite.rs | 86 +++++++++++++-------------- bigtools/src/bbi/beddata.rs | 17 +++--- bigtools/src/bbi/bigbedwrite.rs | 21 ++++--- bigtools/src/bbi/bigwigwrite.rs | 19 +++--- bigtools/src/utils/cli/bigwigmerge.rs | 8 +-- 5 files changed, 70 insertions(+), 81 deletions(-) diff --git a/bigtools/src/bbi/bbiwrite.rs b/bigtools/src/bbi/bbiwrite.rs index 32c76c9..3f14fe0 100644 --- 
a/bigtools/src/bbi/bbiwrite.rs +++ b/bigtools/src/bbi/bbiwrite.rs @@ -128,10 +128,9 @@ impl From for BBIProcessError { } } -pub(crate) struct TempZoomInfo { +pub(crate) struct TempZoomInfo { pub resolution: u32, - pub data_write_future: - tokio::task::JoinHandle>>, + pub data_write_future: tokio::task::JoinHandle>, pub data: TempFileBuffer>, pub sections: crossbeam_channel::Receiver
, } @@ -151,7 +150,7 @@ pub(crate) fn write_blank_headers(file: &mut BufWriter) -> io::Result<()> Ok(()) } -pub(crate) fn write_info( +pub(crate) fn write_info( file: &mut BufWriter, magic: u32, num_zooms: u16, @@ -166,7 +165,7 @@ pub(crate) fn write_info( zoom_entries: Vec, summary: Summary, data_count: u64, -) -> Result<(), BBIProcessError> { +) -> Result<(), ProcessDataError> { file.seek(SeekFrom::Start(0))?; file.write_u32::(magic)?; file.write_u16::(4)?; @@ -566,8 +565,8 @@ pub trait BBIDataSource: Sized { fn process_to_bbi< P: BBIDataProcessor + Send + 'static, - StartProcessing: FnMut(String) -> Result>, - Advance: FnMut(P) -> Result<(), BBIProcessError>, + StartProcessing: FnMut(String) -> Result, + Advance: FnMut(P), >( &mut self, runtime: &Runtime, @@ -582,22 +581,22 @@ pub(crate) type ZoomValue = ( TempFileBuffer, Option>, ); -type Data = ( +type Data = ( crossbeam_channel::Receiver
, TempFileBuffer>, - tokio::task::JoinHandle>>, - Vec>, + tokio::task::JoinHandle>, + Vec, ); -type DataWithoutzooms = ( +type DataWithoutzooms = ( crossbeam_channel::Receiver
, TempFileBuffer>, - tokio::task::JoinHandle>>, + tokio::task::JoinHandle>, ); -async fn write_chroms_with_zooms( +async fn write_chroms_with_zooms( mut file: BufWriter, mut zooms_map: BTreeMap, - mut receiver: futures_mpsc::UnboundedReceiver>, + mut receiver: futures_mpsc::UnboundedReceiver, ) -> Result< ( BufWriter, @@ -605,7 +604,7 @@ async fn write_chroms_with_zooms( Vec>, BTreeMap, ), - BBIProcessError, + ProcessDataError, > { let mut section_iter = vec![]; let mut max_uncompressed_buf_size = 0; @@ -653,16 +652,16 @@ async fn write_chroms_with_zooms( Ok((file, max_uncompressed_buf_size, section_iter, zooms_map)) } -async fn write_chroms_without_zooms( +async fn write_chroms_without_zooms( mut file: BufWriter, - mut receiver: futures_mpsc::UnboundedReceiver>, + mut receiver: futures_mpsc::UnboundedReceiver, ) -> Result< ( BufWriter, usize, Vec>, ), - BBIProcessError, + ProcessDataError, > { let mut section_iter = vec![]; let mut max_uncompressed_buf_size = 0; @@ -768,12 +767,12 @@ pub(crate) fn write_vals< let write_fut = write_chroms_with_zooms(file, zooms_map, recv); let write_fut_handle = runtime.spawn(write_fut); - fn setup_chrom( + fn setup_chrom( send: &mut futures_mpsc::UnboundedSender<( crossbeam_channel::Receiver
, TempFileBuffer>, - tokio::task::JoinHandle>>, - Vec>, + tokio::task::JoinHandle>, + Vec, )>, options: BBIWriteOptions, runtime: &Runtime, @@ -811,11 +810,11 @@ pub(crate) fn write_vals< (zooms_channels, ftx) } - let mut do_read = |chrom: String| -> Result<_, BBIProcessError<_>> { + let mut do_read = |chrom: String| -> Result<_, ProcessDataError> { let length = match chrom_sizes.get(&chrom) { Some(length) => *length, None => { - return Err(BBIProcessError::InvalidChromosome(format!( + return Err(ProcessDataError::InvalidChromosome(format!( "Input bedGraph contains chromosome that isn't in the input chrom sizes: {}", chrom ))); @@ -852,7 +851,6 @@ pub(crate) fn write_vals< summary.sum_squares += chrom_summary.sum_squares; } } - Ok(()) }; vals_iter.process_to_bbi(&runtime, &mut do_read, &mut advance)?; @@ -938,7 +936,7 @@ pub(crate) fn write_vals_no_zoom< let mut summary: Option = None; let (send, recv) = futures_mpsc::unbounded(); - let write_fut = write_chroms_without_zooms::(file, recv); + let write_fut = write_chroms_without_zooms(file, recv); let write_fut_handle = runtime.spawn(write_fut); let setup_chrom = || { @@ -950,11 +948,11 @@ pub(crate) fn write_vals_no_zoom< ftx }; - let mut do_read = |chrom: String| -> Result<_, BBIProcessError<_>> { + let mut do_read = |chrom: String| -> Result<_, ProcessDataError> { let length = match chrom_sizes.get(&chrom) { Some(length) => *length, None => { - return Err(BBIProcessError::InvalidChromosome(format!( + return Err(ProcessDataError::InvalidChromosome(format!( "Input bedGraph contains chromosome that isn't in the input chrom sizes: {}", chrom ))); @@ -997,7 +995,6 @@ pub(crate) fn write_vals_no_zoom< let chrom_zoom_count = zoom_count_map.get(&zoom_count.0).copied().unwrap_or(1); *zoom_count.1 += chrom_zoom_count; } - Ok(()) }; vals_iter.process_to_bbi(&runtime, &mut do_read, &mut advance)?; @@ -1034,30 +1031,29 @@ type ZoomSender = futures_mpsc::Sender<( crossbeam_channel::Receiver
, )>; -pub(crate) struct InternalTempZoomInfo { +pub(crate) struct InternalTempZoomInfo { pub resolution: u32, - pub data_write_future: - tokio::task::JoinHandle>>, + pub data_write_future: tokio::task::JoinHandle>, pub data: TempFileBuffer>>, pub sections: crossbeam_channel::Receiver
, } -pub(crate) struct ZoomsInternalProcessData( - pub(crate) Vec>, +pub(crate) struct ZoomsInternalProcessData( + pub(crate) Vec, pub(crate) Vec<(u32, BBIDataProcessoringInputSectionChannel)>, pub(crate) u32, pub(crate) BBIWriteOptions, pub(crate) Handle, ); -pub(crate) struct ZoomsInternalProcessedData(pub(crate) Vec>); +pub(crate) struct ZoomsInternalProcessedData(pub(crate) Vec); pub(crate) fn write_zoom_vals< V: BBIDataSource, P: BBIDataProcessor + process_internal::BBIDataProcessorCreate< - I = ZoomsInternalProcessData, - Out = ZoomsInternalProcessedData, + I = ZoomsInternalProcessData, + Out = ZoomsInternalProcessedData, > + Send + 'static, >( @@ -1073,7 +1069,7 @@ pub(crate) fn write_zoom_vals< let min_first_zoom_size = average_size.max(10) * 4; let mut zoom_receivers = vec![]; let mut zoom_files = vec![]; - let mut zooms_map: BTreeMap>> = zoom_counts + let mut zooms_map: BTreeMap> = zoom_counts .into_iter() .skip_while(|z| z.0 > min_first_zoom_size as u64) .skip_while(|z| { @@ -1103,7 +1099,7 @@ pub(crate) fn write_zoom_vals< let mut max_uncompressed_buf_size = 0; - let mut do_read = |chrom: String| -> Result> { + let mut do_read = |chrom: String| -> Result { // Make a new id for the chromosome let chrom_id = *chrom_ids .get(&chrom) @@ -1153,8 +1149,6 @@ pub(crate) fn write_zoom_vals< let zoom = zooms_map.get_mut(&resolution).unwrap(); zoom.try_send((data_write_future, data, sections)).unwrap(); } - - Ok(()) }; let mut zooms = Vec::with_capacity(zoom_receivers.len()); @@ -1253,14 +1247,14 @@ pub(crate) fn write_zoom_vals< Ok((file, zoom_entries, max_uncompressed_buf_size)) } -pub(crate) fn write_mid( +pub(crate) fn write_mid( file: &mut BufWriter, pre_data: u64, raw_sections_iter: impl Iterator, chrom_sizes: HashMap, chrom_ids: &HashMap, options: BBIWriteOptions, -) -> Result<(u64, u64, u64, u64), BBIProcessError> { +) -> Result<(u64, u64, u64, u64), ProcessDataError> { let data_size = file.tell()? 
- pre_data; let mut current_offset = pre_data; let sections_iter = raw_sections_iter.map(|mut section| { @@ -1293,11 +1287,11 @@ pub(crate) fn write_mid( Ok((data_size, chrom_index_start, index_start, total_sections)) } -async fn write_data( +async fn write_data( mut data_file: W, section_sender: crossbeam_channel::Sender
, mut frx: futures_mpsc::Receiver>>, -) -> Result<(usize, usize), BBIProcessError> { +) -> Result<(usize, usize), ProcessDataError> { let mut current_offset = 0; let mut total = 0; let mut max_uncompressed_buf_size = 0; @@ -1321,13 +1315,13 @@ async fn write_data( Ok((total, max_uncompressed_buf_size)) } -pub(crate) fn future_channel( +pub(crate) fn future_channel( channel_size: usize, runtime: &Handle, inmemory: bool, ) -> ( BBIDataProcessoringInputSectionChannel, - tokio::task::JoinHandle>>, + tokio::task::JoinHandle>, TempFileBuffer, crossbeam_channel::Receiver
, ) { diff --git a/bigtools/src/bbi/beddata.rs b/bigtools/src/bbi/beddata.rs index 2c9a66b..53c2ba8 100644 --- a/bigtools/src/bbi/beddata.rs +++ b/bigtools/src/bbi/beddata.rs @@ -19,7 +19,7 @@ use crate::bed::bedparser::{ }; use crate::utils::file_view::FileView; use crate::utils::streaming_linereader::StreamingLineReader; -use crate::{BBIDataProcessor, BBIDataSource, BBIProcessError, BedEntry, Value}; +use crate::{BBIDataProcessor, BBIDataSource, BBIProcessError, BedEntry, ProcessDataError, Value}; pub struct BedParserStreamingIterator { bed_data: S, @@ -91,8 +91,8 @@ impl BBIDataSource for BedParserStreamingIterator { fn process_to_bbi< P: BBIDataProcessor, - StartProcessing: FnMut(String) -> Result>, - Advance: FnMut(P) -> Result<(), BBIProcessError>, + StartProcessing: FnMut(String) -> Result, + Advance: FnMut(P), >( &mut self, runtime: &Runtime, @@ -128,7 +128,7 @@ impl BBIDataSource for BedParserStreamingIterator { next_val = match (&mut curr_state, next_val) { // There are no more values ((_, _), None) => { - advance(curr_state.1)?; + advance(curr_state.1); return Ok(()); } // The next value is the same chromosome @@ -153,7 +153,7 @@ impl BBIDataSource for BedParserStreamingIterator { if !self.allow_out_of_order_chroms && prev_chrom.as_str() >= chrom { return Err(BBIProcessError::SourceError(BedValueError::InvalidInput("Input bedGraph not sorted by chromosome. 
Sort with `sort -k1,1 -k2,2n`.".to_string()))); } - advance(prev_state)?; + advance(prev_state); let chrom = chrom.to_string(); let mut p = start_processing(chrom.clone())?; @@ -213,8 +213,8 @@ impl BBIDataSource for BedParserParallelStreamingIterator fn process_to_bbi< P: BBIDataProcessor + Send + 'static, - StartProcessing: FnMut(String) -> Result>, - Advance: FnMut(P) -> Result<(), BBIProcessError>, + StartProcessing: FnMut(String) -> Result, + Advance: FnMut(P), >( &mut self, runtime: &Runtime, @@ -289,7 +289,7 @@ impl BBIDataSource for BedParserParallelStreamingIterator break; }; let p = runtime.block_on(next_chrom).unwrap()?; - advance(p)?; + advance(p); } Ok(()) @@ -350,7 +350,6 @@ mod tests { let mut advance = |p: TestBBIDataProcessor| { counts.push(p.count); let _ = p.destroy(); - Ok(()) }; data.process_to_bbi(&runtime, &mut start_processing, &mut advance) .unwrap(); diff --git a/bigtools/src/bbi/bigbedwrite.rs b/bigtools/src/bbi/bigbedwrite.rs index 9661614..4244b64 100644 --- a/bigtools/src/bbi/bigbedwrite.rs +++ b/bigtools/src/bbi/bigbedwrite.rs @@ -1,5 +1,4 @@ use std::collections::HashMap; -use std::error::Error; use std::ffi::CString; use std::fs::File; use std::io::{self, BufWriter, Write}; @@ -41,10 +40,10 @@ impl BigBedWrite { } } - fn write_pre( + fn write_pre( file: &mut BufWriter, autosql: &Option, - ) -> Result<(u64, u64, u64, u64), BBIProcessError> { + ) -> Result<(u64, u64, u64, u64), ProcessDataError> { write_blank_headers(file)?; let autosql_offset = file.tell()?; @@ -52,7 +51,7 @@ impl BigBedWrite { .clone() .unwrap_or_else(|| crate::bed::autosql::BED3.to_string()); let autosql = CString::new(autosql.into_bytes()).map_err(|_| { - BBIProcessError::InvalidInput("Invalid autosql: null byte in string".to_owned()) + ProcessDataError::InvalidInput("Invalid autosql: null byte in string".to_owned()) })?; file.write_all(autosql.as_bytes_with_nul())?; @@ -177,7 +176,7 @@ impl BigBedWrite { let vals = make_vals()?; - let output = 
bbiwrite::write_zoom_vals::<_, BigBedZoomsProcess<_>>( + let output = bbiwrite::write_zoom_vals::<_, BigBedZoomsProcess>( vals, self.options, &runtime, @@ -809,8 +808,8 @@ impl BBIDataProcessor for BigBedNoZoomsProcess { } } -struct BigBedZoomsProcess { - temp_zoom_items: Vec>, +struct BigBedZoomsProcess { + temp_zoom_items: Vec, chrom_id: u32, options: BBIWriteOptions, runtime: Handle, @@ -818,9 +817,9 @@ struct BigBedZoomsProcess { zoom_items: Vec, } -impl BBIDataProcessorCreate for BigBedZoomsProcess { - type I = ZoomsInternalProcessData; - type Out = ZoomsInternalProcessedData; +impl BBIDataProcessorCreate for BigBedZoomsProcess { + type I = ZoomsInternalProcessData; + type Out = ZoomsInternalProcessedData; fn create(internal_data: Self::I) -> Self { let ZoomsInternalProcessData(temp_zoom_items, zooms_channels, chrom_id, options, runtime) = internal_data; @@ -855,7 +854,7 @@ impl BBIDataProcessorCreate for BigBedZoomsProcess { ZoomsInternalProcessedData(self.temp_zoom_items) } } -impl BBIDataProcessor for BigBedZoomsProcess { +impl BBIDataProcessor for BigBedZoomsProcess { type Value = BedEntry; async fn do_process( &mut self, diff --git a/bigtools/src/bbi/bigwigwrite.rs b/bigtools/src/bbi/bigwigwrite.rs index 06d056b..7a63842 100644 --- a/bigtools/src/bbi/bigwigwrite.rs +++ b/bigtools/src/bbi/bigwigwrite.rs @@ -39,7 +39,6 @@ out.write(chrom_map, vals, runtime)?; ``` */ use std::collections::HashMap; -use std::error::Error; use std::fs::File; use std::io::{self, BufWriter, Write}; use std::vec; @@ -88,9 +87,7 @@ impl BigWigWrite { } } - fn write_pre( - file: &mut BufWriter, - ) -> Result<(u64, u64, u64), BBIProcessError> { + fn write_pre(file: &mut BufWriter) -> Result<(u64, u64, u64), ProcessDataError> { write_blank_headers(file)?; let total_summary_offset = file.tell()?; @@ -211,7 +208,7 @@ impl BigWigWrite { let vals = make_vals()?; - let output = bbiwrite::write_zoom_vals::<_, BigWigZoomsProcess<_>>( + let output = bbiwrite::write_zoom_vals::<_, 
BigWigZoomsProcess>( vals, self.options, &runtime, @@ -653,8 +650,8 @@ impl BBIDataProcessor for BigWigNoZoomsProcess { } } -struct BigWigZoomsProcess { - temp_zoom_items: Vec>, +struct BigWigZoomsProcess { + temp_zoom_items: Vec, chrom_id: u32, options: BBIWriteOptions, runtime: Handle, @@ -662,9 +659,9 @@ struct BigWigZoomsProcess { zoom_items: Vec, } -impl BBIDataProcessorCreate for BigWigZoomsProcess { - type I = ZoomsInternalProcessData; - type Out = ZoomsInternalProcessedData; +impl BBIDataProcessorCreate for BigWigZoomsProcess { + type I = ZoomsInternalProcessData; + type Out = ZoomsInternalProcessedData; fn create(internal_data: Self::I) -> Self { let ZoomsInternalProcessData(temp_zoom_items, zooms_channels, chrom_id, options, runtime) = internal_data; @@ -698,7 +695,7 @@ impl BBIDataProcessorCreate for BigWigZoomsProcess { ZoomsInternalProcessedData(self.temp_zoom_items) } } -impl BBIDataProcessor for BigWigZoomsProcess { +impl BBIDataProcessor for BigWigZoomsProcess { type Value = Value; async fn do_process( &mut self, diff --git a/bigtools/src/utils/cli/bigwigmerge.rs b/bigtools/src/utils/cli/bigwigmerge.rs index b453be3..1ee5871 100644 --- a/bigtools/src/utils/cli/bigwigmerge.rs +++ b/bigtools/src/utils/cli/bigwigmerge.rs @@ -9,7 +9,7 @@ use thiserror::Error; use crate::utils::merge::merge_sections_many; use crate::utils::reopen::ReopenableFile; -use crate::{BBIDataProcessor, BBIReadError, BigWigRead, BigWigWrite}; +use crate::{BBIDataProcessor, BBIReadError, BigWigRead, BigWigWrite, ProcessDataError}; use crate::{BBIDataSource, BBIProcessError, Value}; use tokio::runtime::{self, Runtime}; @@ -351,8 +351,8 @@ impl BBIDataSource for ChromGroupReadImpl { fn process_to_bbi< P: BBIDataProcessor, - StartProcessing: FnMut(String) -> Result>, - Advance: FnMut(P) -> Result<(), BBIProcessError>, + StartProcessing: FnMut(String) -> Result, + Advance: FnMut(P), >( &mut self, runtime: &Runtime, @@ -380,7 +380,7 @@ impl BBIDataSource for ChromGroupReadImpl { 
runtime.block_on(read)?; } - advance(p)?; + advance(p); } Some(Err(e)) => return Err(BBIProcessError::SourceError(e)), None => break, From 60838088241fe899bd137472d8672355c970d7f3 Mon Sep 17 00:00:00 2001 From: Jack Huey <31162821+jackh726@users.noreply.github.com> Date: Thu, 4 Jul 2024 13:31:41 -0400 Subject: [PATCH 30/31] One pub -> pub(crate) --- bigtools/src/bbi/bbiwrite.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigtools/src/bbi/bbiwrite.rs b/bigtools/src/bbi/bbiwrite.rs index 3f14fe0..dfd376b 100644 --- a/bigtools/src/bbi/bbiwrite.rs +++ b/bigtools/src/bbi/bbiwrite.rs @@ -684,7 +684,7 @@ async fn write_chroms_without_zooms( Ok((file, max_uncompressed_buf_size, section_iter)) } -pub struct InternalProcessData( +pub(crate) struct InternalProcessData( pub(crate) Vec<(u32, BBIDataProcessoringInputSectionChannel)>, pub(crate) BBIDataProcessoringInputSectionChannel, pub(crate) u32, From e446a6ef4c9c0de6897e3307cfe2f36120791329 Mon Sep 17 00:00:00 2001 From: Jack Huey <31162821+jackh726@users.noreply.github.com> Date: Thu, 4 Jul 2024 20:17:15 -0400 Subject: [PATCH 31/31] Make create_file and open_file take . Abstract BigWigWrite and BigBedWrite over W: Write+Seek. Take chrom_sizes in create_file instead of write*. 
--- bigtools/src/bbi/bbiread.rs | 4 +- bigtools/src/bbi/bbiwrite.rs | 91 +-- bigtools/src/bbi/bigbedread.rs | 9 +- bigtools/src/bbi/bigbedwrite.rs | 555 +++++++++--------- bigtools/src/bbi/bigwigread.rs | 14 +- bigtools/src/bbi/bigwigwrite.rs | 342 +++++------ bigtools/src/utils/cli/bedgraphtobigwig.rs | 21 +- bigtools/src/utils/cli/bedtobigbed.rs | 18 +- .../src/utils/cli/bigwigaverageoverbed.rs | 4 +- bigtools/src/utils/cli/bigwigmerge.rs | 6 +- bigtools/src/utils/file/reopen.rs | 3 +- bigtools/tests/bigbedwrite.rs | 17 +- bigtools/tests/bigwigread.rs | 6 +- bigtools/tests/bigwigwrite.rs | 27 +- pybigtools/src/lib.rs | 44 +- 15 files changed, 602 insertions(+), 559 deletions(-) diff --git a/bigtools/src/bbi/bbiread.rs b/bigtools/src/bbi/bbiread.rs index 0a44f7d..211fc6b 100644 --- a/bigtools/src/bbi/bbiread.rs +++ b/bigtools/src/bbi/bbiread.rs @@ -501,8 +501,8 @@ impl GenericBBIRead { /// Opens a generic bbi file pub fn open_file(path: &str) -> Result { let reopen = ReopenableFile { - path: path.to_string(), - file: File::open(path)?, + file: File::open(&path)?, + path: path.into(), }; let b = GenericBBIRead::open(reopen); if b.is_err() { diff --git a/bigtools/src/bbi/bbiwrite.rs b/bigtools/src/bbi/bbiwrite.rs index dfd376b..624d097 100644 --- a/bigtools/src/bbi/bbiwrite.rs +++ b/bigtools/src/bbi/bbiwrite.rs @@ -140,7 +140,9 @@ pub(crate) type BBIDataProcessoringInputSectionChannel = const MAX_ZOOM_LEVELS: usize = 10; -pub(crate) fn write_blank_headers(file: &mut BufWriter) -> io::Result<()> { +pub(crate) fn write_blank_headers( + file: &mut BufWriter, +) -> io::Result<()> { file.seek(SeekFrom::Start(0))?; // Common header file.write_all(&[0; 64])?; @@ -150,8 +152,8 @@ pub(crate) fn write_blank_headers(file: &mut BufWriter) -> io::Result<()> Ok(()) } -pub(crate) fn write_info( - file: &mut BufWriter, +pub(crate) fn write_info( + file: &mut BufWriter, magic: u32, num_zooms: u16, chrom_index_start: u64, @@ -205,8 +207,8 @@ pub(crate) fn write_info( Ok(()) } 
-pub(crate) fn write_chrom_tree( - file: &mut BufWriter, +pub(crate) fn write_chrom_tree( + file: &mut BufWriter, chrom_sizes: std::collections::HashMap, chrom_ids: &std::collections::HashMap, ) -> io::Result<()> { @@ -493,8 +495,8 @@ pub(crate) fn write_rtreeindex( Ok(()) } -pub(crate) fn write_zooms( - mut file: &mut BufWriter, +pub(crate) fn write_zooms( + mut file: &mut BufWriter, zooms: Vec, data_size: u64, options: BBIWriteOptions, @@ -528,7 +530,7 @@ pub(crate) fn write_zooms( let zoom_index_offset = file.tell()?; //println!("Zoom {:?}, data: {:?}, offset {:?}", zoom.resolution, zoom_data_offset, zoom_index_offset); assert_eq!(zoom_index_offset - zoom_data_offset, zoom_size); - write_rtreeindex(&mut file, nodes, levels, total_sections, options)?; + write_rtreeindex(file, nodes, levels, total_sections, options)?; zoom_entries.push(ZoomHeader { reduction_level: zoom.resolution, @@ -581,25 +583,25 @@ pub(crate) type ZoomValue = ( TempFileBuffer, Option>, ); -type Data = ( +type Data = ( crossbeam_channel::Receiver
, - TempFileBuffer>, + TempFileBuffer>, tokio::task::JoinHandle>, Vec, ); -type DataWithoutzooms = ( +type DataWithoutzooms = ( crossbeam_channel::Receiver
, - TempFileBuffer>, + TempFileBuffer>, tokio::task::JoinHandle>, ); -async fn write_chroms_with_zooms( - mut file: BufWriter, +async fn write_chroms_with_zooms( + mut file: BufWriter, mut zooms_map: BTreeMap, - mut receiver: futures_mpsc::UnboundedReceiver, + mut receiver: futures_mpsc::UnboundedReceiver>, ) -> Result< ( - BufWriter, + BufWriter, usize, Vec>, BTreeMap, @@ -652,12 +654,12 @@ async fn write_chroms_with_zooms( Ok((file, max_uncompressed_buf_size, section_iter, zooms_map)) } -async fn write_chroms_without_zooms( - mut file: BufWriter, - mut receiver: futures_mpsc::UnboundedReceiver, +async fn write_chroms_without_zooms( + mut file: BufWriter, + mut receiver: futures_mpsc::UnboundedReceiver>, ) -> Result< ( - BufWriter, + BufWriter, usize, Vec>, ), @@ -724,6 +726,7 @@ pub trait BBIDataProcessor: process_internal::BBIDataProcessorCreate { } pub(crate) fn write_vals< + W: Write + Seek + Send + 'static, V: BBIDataSource, P: BBIDataProcessor + process_internal::BBIDataProcessorCreate< @@ -733,15 +736,15 @@ pub(crate) fn write_vals< + 'static, >( mut vals_iter: V, - file: BufWriter, + file: BufWriter, options: BBIWriteOptions, runtime: Runtime, - chrom_sizes: HashMap, + chrom_sizes: &HashMap, ) -> Result< ( IdMap, Summary, - BufWriter, + BufWriter, Flatten>>, Vec, usize, @@ -767,10 +770,10 @@ pub(crate) fn write_vals< let write_fut = write_chroms_with_zooms(file, zooms_map, recv); let write_fut_handle = runtime.spawn(write_fut); - fn setup_chrom( + fn setup_chrom( send: &mut futures_mpsc::UnboundedSender<( crossbeam_channel::Receiver
, - TempFileBuffer>, + TempFileBuffer>, tokio::task::JoinHandle>, Vec, )>, @@ -903,6 +906,7 @@ pub(crate) struct NoZoomsInternalProcessData( pub(crate) struct NoZoomsInternalProcessedData(pub(crate) Summary, pub(crate) Vec<(u64, u64)>); pub(crate) fn write_vals_no_zoom< + W: Write + Seek + Send + 'static, V: BBIDataSource, P: BBIDataProcessor + process_internal::BBIDataProcessorCreate< @@ -912,16 +916,16 @@ pub(crate) fn write_vals_no_zoom< + 'static, >( mut vals_iter: V, - file: BufWriter, + file: BufWriter, options: BBIWriteOptions, runtime: &Runtime, - chrom_sizes: HashMap, + chrom_sizes: &HashMap, ) -> Result< ( IdMap, Summary, BTreeMap, - BufWriter, + BufWriter, Flatten>>, usize, ), @@ -1025,35 +1029,38 @@ pub(crate) fn write_vals_no_zoom< } // Zooms have to be double-buffered: first because chroms could be processed in parallel and second because we don't know the offset of each zoom immediately -type ZoomSender = futures_mpsc::Sender<( +type ZoomSender = futures_mpsc::Sender<( tokio::task::JoinHandle>, - TempFileBuffer>>, + TempFileBuffer>>, crossbeam_channel::Receiver
, )>; -pub(crate) struct InternalTempZoomInfo { +pub(crate) struct InternalTempZoomInfo { pub resolution: u32, pub data_write_future: tokio::task::JoinHandle>, - pub data: TempFileBuffer>>, + pub data: TempFileBuffer>>, pub sections: crossbeam_channel::Receiver
, } -pub(crate) struct ZoomsInternalProcessData( - pub(crate) Vec, +pub(crate) struct ZoomsInternalProcessData( + pub(crate) Vec>, pub(crate) Vec<(u32, BBIDataProcessoringInputSectionChannel)>, pub(crate) u32, pub(crate) BBIWriteOptions, pub(crate) Handle, ); -pub(crate) struct ZoomsInternalProcessedData(pub(crate) Vec); +pub(crate) struct ZoomsInternalProcessedData( + pub(crate) Vec>, +); pub(crate) fn write_zoom_vals< + W: Write + Seek + Send + 'static, V: BBIDataSource, P: BBIDataProcessor + process_internal::BBIDataProcessorCreate< - I = ZoomsInternalProcessData, - Out = ZoomsInternalProcessedData, + I = ZoomsInternalProcessData, + Out = ZoomsInternalProcessedData, > + Send + 'static, >( @@ -1063,13 +1070,13 @@ pub(crate) fn write_zoom_vals< chrom_ids: &HashMap, average_size: u32, zoom_counts: BTreeMap, - mut file: BufWriter, + mut file: BufWriter, data_size: u64, -) -> Result<(BufWriter, Vec, usize), BBIProcessError> { +) -> Result<(BufWriter, Vec, usize), BBIProcessError> { let min_first_zoom_size = average_size.max(10) * 4; let mut zoom_receivers = vec![]; let mut zoom_files = vec![]; - let mut zooms_map: BTreeMap> = zoom_counts + let mut zooms_map: BTreeMap> = zoom_counts .into_iter() .skip_while(|z| z.0 > min_first_zoom_size as u64) .skip_while(|z| { @@ -1247,8 +1254,8 @@ pub(crate) fn write_zoom_vals< Ok((file, zoom_entries, max_uncompressed_buf_size)) } -pub(crate) fn write_mid( - file: &mut BufWriter, +pub(crate) fn write_mid( + file: &mut BufWriter, pre_data: u64, raw_sections_iter: impl Iterator, chrom_sizes: HashMap, diff --git a/bigtools/src/bbi/bigbedread.rs b/bigtools/src/bbi/bigbedread.rs index b26b24a..6784b20 100644 --- a/bigtools/src/bbi/bigbedread.rs +++ b/bigtools/src/bbi/bigbedread.rs @@ -1,6 +1,7 @@ use std::borrow::BorrowMut; use std::fs::File; use std::io::{self, BufRead, BufReader, Seek, SeekFrom}; +use std::path::Path; use std::vec::Vec; use byteorder::{BigEndian, LittleEndian, ReadBytesExt}; @@ -149,14 +150,14 @@ impl BigBedRead { 
impl BigBedRead { /// Opens a new `BigBedRead` from a given path as a file. - pub fn open_file(path: &str) -> Result { + pub fn open_file(path: impl AsRef) -> Result { let reopen = ReopenableFile { - path: path.to_string(), - file: File::open(path)?, + file: File::open(&path)?, + path: path.as_ref().to_owned(), }; let b = BigBedRead::open(reopen); if b.is_err() { - eprintln!("Error when opening: {}", path); + eprintln!("Error when opening: {:?}", path.as_ref()); } b } diff --git a/bigtools/src/bbi/bigbedwrite.rs b/bigtools/src/bbi/bigbedwrite.rs index 4244b64..cfb00e0 100644 --- a/bigtools/src/bbi/bigbedwrite.rs +++ b/bigtools/src/bbi/bigbedwrite.rs @@ -1,7 +1,8 @@ use std::collections::HashMap; use std::ffi::CString; use std::fs::File; -use std::io::{self, BufWriter, Write}; +use std::io::{self, BufWriter, Seek, Write}; +use std::path::Path; use futures::sink::SinkExt; @@ -25,23 +26,35 @@ use crate::bbiwrite::{ }; /// The struct used to write a bigBed file -pub struct BigBedWrite { - pub path: String, +pub struct BigBedWrite { + out: W, + chrom_sizes: HashMap, pub options: BBIWriteOptions, pub autosql: Option, } -impl BigBedWrite { - pub fn create_file(path: String) -> Self { +impl BigBedWrite { + pub fn create_file( + path: impl AsRef, + chrom_sizes: HashMap, + ) -> io::Result { + let out = File::create(path)?; + Ok(BigBedWrite::new(out, chrom_sizes)) + } +} + +impl BigBedWrite { + pub fn new(out: W, chrom_sizes: HashMap) -> Self { BigBedWrite { - path, + out, + chrom_sizes, options: BBIWriteOptions::default(), autosql: None, } } fn write_pre( - file: &mut BufWriter, + file: &mut BufWriter, autosql: &Option, ) -> Result<(u64, u64, u64, u64), ProcessDataError> { write_blank_headers(file)?; @@ -82,22 +95,20 @@ impl BigBedWrite { /// Write the values from `V` as a bigWig. Will utilize the provided runtime for encoding values and for reading through the values (potentially parallelized by chromosome). 
pub fn write>( self, - chrom_sizes: HashMap, vals: V, runtime: Runtime, ) -> Result<(), BBIProcessError> { - let fp = File::create(self.path.clone())?; - let mut file = BufWriter::new(fp); + let mut file = BufWriter::new(self.out); let (autosql_offset, total_summary_offset, full_data_offset, pre_data) = BigBedWrite::write_pre(&mut file, &self.autosql)?; - let output = bbiwrite::write_vals::<_, BigBedFullProcess>( + let output = bbiwrite::write_vals::<_, _, BigBedFullProcess>( vals, file, self.options, runtime, - chrom_sizes.clone(), + &self.chrom_sizes, ); let (chrom_ids, summary, mut file, raw_sections_iter, zoom_infos, uncompress_buf_size) = output?; @@ -107,7 +118,7 @@ impl BigBedWrite { &mut file, pre_data, raw_sections_iter, - chrom_sizes, + self.chrom_sizes, &chrom_ids, self.options, )?; @@ -143,23 +154,21 @@ impl BigBedWrite { pub fn write_multipass>( self, make_vals: impl Fn() -> Result>, - chrom_sizes: HashMap, runtime: Runtime, ) -> Result<(), BBIProcessError> { - let fp = File::create(self.path.clone())?; - let mut file = BufWriter::new(fp); + let mut file = BufWriter::new(self.out); let (autosql_offset, total_summary_offset, full_data_offset, pre_data) = BigBedWrite::write_pre(&mut file, &self.autosql)?; let vals = make_vals()?; - let output = bbiwrite::write_vals_no_zoom::<_, BigBedNoZoomsProcess>( + let output = bbiwrite::write_vals_no_zoom::<_, _, BigBedNoZoomsProcess>( vals, file, self.options, &runtime, - chrom_sizes.clone(), + &self.chrom_sizes, ); let (chrom_ids, summary, zoom_counts, mut file, raw_sections_iter, mut uncompress_buf_size) = output?; @@ -169,14 +178,14 @@ impl BigBedWrite { &mut file, pre_data, raw_sections_iter, - chrom_sizes, + self.chrom_sizes, &chrom_ids, self.options, )?; let vals = make_vals()?; - let output = bbiwrite::write_zoom_vals::<_, BigBedZoomsProcess>( + let output = bbiwrite::write_zoom_vals::<_, _, BigBedZoomsProcess>( vals, self.options, &runtime, @@ -210,180 +219,69 @@ impl BigBedWrite { Ok(()) } +} - async fn 
process_val( - current_val: BedEntry, - next_val: Option<&BedEntry>, - chrom_length: u32, - chrom: &String, - summary: &mut Option, - items: &mut Vec, - overlap: &mut IndexList, - options: BBIWriteOptions, - runtime: &Handle, - ftx: &mut BBIDataProcessoringInputSectionChannel, - chrom_id: u32, - ) -> Result<(), ProcessDataError> { - // Check a few preconditions: - // - The current end is greater than or equal to the start - // - The current end is at most the chromosome length - // - If there is a next value, then it does not overlap value - // TODO: test these correctly fails - if current_val.start > current_val.end { - return Err(ProcessDataError::InvalidInput(format!( - "Invalid bed: {} > {}", - current_val.start, current_val.end - ))); - } - if current_val.start >= chrom_length { - return Err(ProcessDataError::InvalidInput(format!( - "Invalid bed: `{}` is greater than the chromosome ({}) length ({})", - current_val.start, chrom, chrom_length - ))); - } - match next_val { - None => (), - Some(next_val) => { - if current_val.start > next_val.start { - return Err(ProcessDataError::InvalidInput(format!( - "Invalid bed: not sorted on chromosome {} at {}-{} (first) and {}-{} (second). 
Use sort -k1,1 -k2,2n to sort the bed before input.", - chrom, - current_val.start, - current_val.end, - next_val.start, - next_val.end, - ))); - } +async fn process_val( + current_val: BedEntry, + next_val: Option<&BedEntry>, + chrom_length: u32, + chrom: &String, + summary: &mut Option, + items: &mut Vec, + overlap: &mut IndexList, + options: BBIWriteOptions, + runtime: &Handle, + ftx: &mut BBIDataProcessoringInputSectionChannel, + chrom_id: u32, +) -> Result<(), ProcessDataError> { + // Check a few preconditions: + // - The current end is greater than or equal to the start + // - The current end is at most the chromosome length + // - If there is a next value, then it does not overlap value + // TODO: test these correctly fails + if current_val.start > current_val.end { + return Err(ProcessDataError::InvalidInput(format!( + "Invalid bed: {} > {}", + current_val.start, current_val.end + ))); + } + if current_val.start >= chrom_length { + return Err(ProcessDataError::InvalidInput(format!( + "Invalid bed: `{}` is greater than the chromosome ({}) length ({})", + current_val.start, chrom, chrom_length + ))); + } + match next_val { + None => (), + Some(next_val) => { + if current_val.start > next_val.start { + return Err(ProcessDataError::InvalidInput(format!( + "Invalid bed: not sorted on chromosome {} at {}-{} (first) and {}-{} (second). Use sort -k1,1 -k2,2n to sort the bed before input.", + chrom, + current_val.start, + current_val.end, + next_val.start, + next_val.end, + ))); } } - - // Now, actually process the value. - - // First, update the summary. 
- let add_interval_to_summary = - move |overlap: &mut IndexList, - summary: &mut Option, - item_start: u32, - item_end: u32, - next_start_opt: Option| { - // If any overlaps exists, it must be starting at the current start (else it would have to be after the current entry) - // If the overlap starts before, the entry wasn't correctly cut last iteration - debug_assert!(overlap - .head() - .map(|f| f.start == item_start) - .unwrap_or(true)); - - // For each item in `overlap` that overlaps the current - // item, add `1` to the value. - let mut index = overlap.head_index(); - while let Some(i) = index { - match overlap.get_mut(i) { - None => break, - Some(o) => { - o.value += 1.0; - if item_end < o.end { - let value = o.value - 1.0; - let end = o.end; - o.end = item_end; - overlap.insert_after( - i, - Value { - start: item_end, - end, - value, - }, - ); - break; - } - index = overlap.next_index(i); - } - } - } - - debug_assert!(overlap.tail().map(|o| o.end >= item_start).unwrap_or(true)); - - if overlap.tail().map(|o| o.end).unwrap_or(item_start) == item_start { - overlap.push_back(Value { - start: item_start, - end: item_end, - value: 1.0, - }); - } - - let next_start = next_start_opt.unwrap_or(u32::max_value()); - - while overlap - .head() - .map(|f| f.start < next_start) - .unwrap_or(false) - { - let mut removed = overlap.pop_front().unwrap(); - let (len, val) = if removed.end <= next_start { - (removed.end - removed.start, f64::from(removed.value)) - } else { - let len = next_start - removed.start; - let val = f64::from(removed.value); - removed.start = next_start; - overlap.push_front(removed); - (len, val) - }; - - match summary { - None => { - *summary = Some(Summary { - total_items: 0, - bases_covered: u64::from(len), - min_val: val, - max_val: val, - sum: f64::from(len) * val, - sum_squares: f64::from(len) * val * val, - }) - } - Some(summary) => { - summary.bases_covered += u64::from(len); - summary.min_val = summary.min_val.min(val); - summary.max_val = 
summary.max_val.max(val); - summary.sum += f64::from(len) * val; - summary.sum_squares += f64::from(len) * val * val; - } - } - } - }; - - add_interval_to_summary( - overlap, - summary, - current_val.start, - current_val.end, - next_val.map(|v| v.start), - ); - - // Then, add the current item to the actual values, and encode if full, or last item - items.push(current_val); - if next_val.is_none() || items.len() >= options.items_per_slot as usize { - let items = - std::mem::replace(items, Vec::with_capacity(options.items_per_slot as usize)); - let handle = runtime.spawn(encode_section(options.compress, items, chrom_id)); - ftx.send(handle).await.expect("Couldn't send"); - } - - Ok(()) } - async fn process_val_zoom( - zoom_items: &mut Vec, - options: BBIWriteOptions, - item_start: u32, - item_end: u32, - next_val: Option<&BedEntry>, - runtime: &Handle, - chrom_id: u32, - ) -> Result<(), ProcessDataError> { - // Then, add the item to the zoom item queues. This is a bit complicated. - for zoom_item in zoom_items.iter_mut() { - debug_assert_ne!(zoom_item.records.len(), options.items_per_slot as usize); - - let overlap = &mut zoom_item.overlap; + // Now, actually process the value. + + // First, update the summary. + let add_interval_to_summary = + move |overlap: &mut IndexList, + summary: &mut Option, + item_start: u32, + item_end: u32, + next_start_opt: Option| { + // If any overlaps exists, it must be starting at the current start (else it would have to be after the current entry) + // If the overlap starts before, the entry wasn't correctly cut last iteration + debug_assert!(overlap + .head() + .map(|f| f.start == item_start) + .unwrap_or(true)); // For each item in `overlap` that overlaps the current // item, add `1` to the value. 
@@ -422,7 +320,7 @@ impl BigBedWrite { }); } - let next_start = next_val.map(|v| v.start).unwrap_or(u32::max_value()); + let next_start = next_start_opt.unwrap_or(u32::max_value()); while overlap .head() @@ -430,95 +328,206 @@ impl BigBedWrite { .unwrap_or(false) { let mut removed = overlap.pop_front().unwrap(); - let val = f64::from(removed.value); - let (removed_start, removed_end) = if removed.end <= next_start { - (removed.start, removed.end) + let (len, val) = if removed.end <= next_start { + (removed.end - removed.start, f64::from(removed.value)) } else { - let start = removed.start; + let len = next_start - removed.start; + let val = f64::from(removed.value); removed.start = next_start; overlap.push_front(removed); - (start, next_start) + (len, val) }; - let mut add_start = removed_start; - loop { - if add_start >= removed_end { - if next_val.is_none() { - if let Some((mut zoom2, total_items)) = zoom_item.live_info.take() { - zoom2.summary.total_items = total_items; - zoom_item.records.push(zoom2); - } - if !zoom_item.records.is_empty() { - let items = std::mem::take(&mut zoom_item.records); - let handle = - runtime.spawn(encode_zoom_section(options.compress, items)); - zoom_item.channel.send(handle).await.expect("Couln't send"); - } - } - break; + match summary { + None => { + *summary = Some(Summary { + total_items: 0, + bases_covered: u64::from(len), + min_val: val, + max_val: val, + sum: f64::from(len) * val, + sum_squares: f64::from(len) * val * val, + }) } - let (zoom2, _) = zoom_item.live_info.get_or_insert(( - ZoomRecord { - chrom: chrom_id, - start: add_start, - end: add_start, - summary: Summary { - total_items: 0, - bases_covered: 0, - min_val: 1.0, - max_val: 1.0, - sum: 0.0, - sum_squares: 0.0, - }, - }, - 0, - )); - // The end of zoom record - let next_end = zoom2.start + zoom_item.size; - // End of bases that we could add - let add_end = std::cmp::min(next_end, removed_end); - // If the last zoom ends before this value starts, we don't add 
anything - if add_end >= add_start { - let added_bases = add_end - add_start; - zoom2.end = add_end; - zoom2.summary.total_items += 1; // XXX - zoom2.summary.bases_covered += u64::from(added_bases); - zoom2.summary.min_val = zoom2.summary.min_val.min(val); - zoom2.summary.max_val = zoom2.summary.max_val.max(val); - zoom2.summary.sum += f64::from(added_bases) * val; - zoom2.summary.sum_squares += f64::from(added_bases) * val * val; + Some(summary) => { + summary.bases_covered += u64::from(len); + summary.min_val = summary.min_val.min(val); + summary.max_val = summary.max_val.max(val); + summary.sum += f64::from(len) * val; + summary.sum_squares += f64::from(len) * val * val; } - // If we made it to the end of the zoom (whether it was because the zoom ended before this value started, - // or we added to the end of the zoom), then write this zooms to the current section - if add_end == next_end { - zoom_item.records.push( - zoom_item - .live_info - .take() - .map(|(mut zoom_item, total_items)| { - zoom_item.summary.total_items = total_items; - zoom_item - }) - .unwrap(), + } + } + }; + + add_interval_to_summary( + overlap, + summary, + current_val.start, + current_val.end, + next_val.map(|v| v.start), + ); + + // Then, add the current item to the actual values, and encode if full, or last item + items.push(current_val); + if next_val.is_none() || items.len() >= options.items_per_slot as usize { + let items = std::mem::replace(items, Vec::with_capacity(options.items_per_slot as usize)); + let handle = runtime.spawn(encode_section(options.compress, items, chrom_id)); + ftx.send(handle).await.expect("Couldn't send"); + } + + Ok(()) +} + +async fn process_val_zoom( + zoom_items: &mut Vec, + options: BBIWriteOptions, + item_start: u32, + item_end: u32, + next_val: Option<&BedEntry>, + runtime: &Handle, + chrom_id: u32, +) -> Result<(), ProcessDataError> { + // Then, add the item to the zoom item queues. This is a bit complicated. 
+ for zoom_item in zoom_items.iter_mut() { + debug_assert_ne!(zoom_item.records.len(), options.items_per_slot as usize); + + let overlap = &mut zoom_item.overlap; + + // For each item in `overlap` that overlaps the current + // item, add `1` to the value. + let mut index = overlap.head_index(); + while let Some(i) = index { + match overlap.get_mut(i) { + None => break, + Some(o) => { + o.value += 1.0; + if item_end < o.end { + let value = o.value - 1.0; + let end = o.end; + o.end = item_end; + overlap.insert_after( + i, + Value { + start: item_end, + end, + value, + }, ); + break; } - // Set where we would start for next time - add_start = std::cmp::max(add_end, removed_start); - // Write section if full - if zoom_item.records.len() == options.items_per_slot as usize { - let items = std::mem::take(&mut zoom_item.records); - let handle = runtime.spawn(encode_zoom_section(options.compress, items)); - zoom_item.channel.send(handle).await.expect("Couln't send"); - } + index = overlap.next_index(i); } } + } + + debug_assert!(overlap.tail().map(|o| o.end >= item_start).unwrap_or(true)); - debug_assert_ne!(zoom_item.records.len(), options.items_per_slot as usize); + if overlap.tail().map(|o| o.end).unwrap_or(item_start) == item_start { + overlap.push_back(Value { + start: item_start, + end: item_end, + value: 1.0, + }); } - Ok(()) + let next_start = next_val.map(|v| v.start).unwrap_or(u32::max_value()); + + while overlap + .head() + .map(|f| f.start < next_start) + .unwrap_or(false) + { + let mut removed = overlap.pop_front().unwrap(); + let val = f64::from(removed.value); + let (removed_start, removed_end) = if removed.end <= next_start { + (removed.start, removed.end) + } else { + let start = removed.start; + removed.start = next_start; + overlap.push_front(removed); + (start, next_start) + }; + + let mut add_start = removed_start; + loop { + if add_start >= removed_end { + if next_val.is_none() { + if let Some((mut zoom2, total_items)) = zoom_item.live_info.take() { + 
zoom2.summary.total_items = total_items; + zoom_item.records.push(zoom2); + } + if !zoom_item.records.is_empty() { + let items = std::mem::take(&mut zoom_item.records); + let handle = + runtime.spawn(encode_zoom_section(options.compress, items)); + zoom_item.channel.send(handle).await.expect("Couln't send"); + } + } + break; + } + let (zoom2, _) = zoom_item.live_info.get_or_insert(( + ZoomRecord { + chrom: chrom_id, + start: add_start, + end: add_start, + summary: Summary { + total_items: 0, + bases_covered: 0, + min_val: 1.0, + max_val: 1.0, + sum: 0.0, + sum_squares: 0.0, + }, + }, + 0, + )); + // The end of zoom record + let next_end = zoom2.start + zoom_item.size; + // End of bases that we could add + let add_end = std::cmp::min(next_end, removed_end); + // If the last zoom ends before this value starts, we don't add anything + if add_end >= add_start { + let added_bases = add_end - add_start; + zoom2.end = add_end; + zoom2.summary.total_items += 1; // XXX + zoom2.summary.bases_covered += u64::from(added_bases); + zoom2.summary.min_val = zoom2.summary.min_val.min(val); + zoom2.summary.max_val = zoom2.summary.max_val.max(val); + zoom2.summary.sum += f64::from(added_bases) * val; + zoom2.summary.sum_squares += f64::from(added_bases) * val * val; + } + // If we made it to the end of the zoom (whether it was because the zoom ended before this value started, + // or we added to the end of the zoom), then write this zooms to the current section + if add_end == next_end { + zoom_item.records.push( + zoom_item + .live_info + .take() + .map(|(mut zoom_item, total_items)| { + zoom_item.summary.total_items = total_items; + zoom_item + }) + .unwrap(), + ); + } + // Set where we would start for next time + add_start = std::cmp::max(add_end, removed_start); + // Write section if full + if zoom_item.records.len() == options.items_per_slot as usize { + let items = std::mem::take(&mut zoom_item.records); + let handle = runtime.spawn(encode_zoom_section(options.compress, 
items)); + zoom_item.channel.send(handle).await.expect("Couln't send"); + } + } + } + + debug_assert_ne!(zoom_item.records.len(), options.items_per_slot as usize); } + + Ok(()) } + // While we do technically lose precision here by using the f32 in Value, we can reuse the same merge_into method struct ZoomItem { size: u32, @@ -638,7 +647,7 @@ impl BBIDataProcessor for BigBedFullProcess { let item_start = current_val.start; let item_end = current_val.end; - BigBedWrite::process_val( + process_val( current_val, next_val, length, @@ -653,7 +662,7 @@ impl BBIDataProcessor for BigBedFullProcess { ) .await?; - BigBedWrite::process_val_zoom( + process_val_zoom( &mut state_val.zoom_items, *options, item_start, @@ -778,7 +787,7 @@ impl BBIDataProcessor for BigBedNoZoomsProcess { let item_start = current_val.start; let item_end = current_val.end; - BigBedWrite::process_val( + process_val( current_val, next_val, *length, @@ -808,8 +817,8 @@ impl BBIDataProcessor for BigBedNoZoomsProcess { } } -struct BigBedZoomsProcess { - temp_zoom_items: Vec, +struct BigBedZoomsProcess { + temp_zoom_items: Vec>, chrom_id: u32, options: BBIWriteOptions, runtime: Handle, @@ -817,9 +826,9 @@ struct BigBedZoomsProcess { zoom_items: Vec, } -impl BBIDataProcessorCreate for BigBedZoomsProcess { - type I = ZoomsInternalProcessData; - type Out = ZoomsInternalProcessedData; +impl BBIDataProcessorCreate for BigBedZoomsProcess { + type I = ZoomsInternalProcessData; + type Out = ZoomsInternalProcessedData; fn create(internal_data: Self::I) -> Self { let ZoomsInternalProcessData(temp_zoom_items, zooms_channels, chrom_id, options, runtime) = internal_data; @@ -854,7 +863,7 @@ impl BBIDataProcessorCreate for BigBedZoomsProcess { ZoomsInternalProcessedData(self.temp_zoom_items) } } -impl BBIDataProcessor for BigBedZoomsProcess { +impl BBIDataProcessor for BigBedZoomsProcess { type Value = BedEntry; async fn do_process( &mut self, @@ -869,7 +878,7 @@ impl BBIDataProcessor for BigBedZoomsProcess { .. 
} = self; - BigBedWrite::process_val_zoom( + process_val_zoom( zoom_items, *options, current_val.start, diff --git a/bigtools/src/bbi/bigwigread.rs b/bigtools/src/bbi/bigwigread.rs index b5b7bb1..bed1f36 100644 --- a/bigtools/src/bbi/bigwigread.rs +++ b/bigtools/src/bbi/bigwigread.rs @@ -12,9 +12,8 @@ Provides the interface for reading bigWig files. # dir.push("resources/test"); # let mut bigwig = dir.clone(); # bigwig.push("valid.bigWig"); -# let bigwig = bigwig.to_string_lossy(); -// First, we open a bigWig using a file name (as a `&str`). -let mut bwread = BigWigRead::open_file(&bigwig)?; +// First, we open a bigWig using a file name. +let mut bwread = BigWigRead::open_file(bigwig)?; // Then, we could get the chromosomes and lengths let chroms = bwread.chroms(); @@ -42,6 +41,7 @@ assert_eq!(first_interval.value, 0.06792); use std::borrow::BorrowMut; use std::fs::File; use std::io::{self, Seek, SeekFrom}; +use std::path::Path; use std::vec::Vec; use byteordered::{ByteOrdered, Endianness}; @@ -195,14 +195,14 @@ impl BigWigRead { impl BigWigRead { /// Opens a new `BigWigRead` from a given path as a file. - pub fn open_file(path: &str) -> Result { + pub fn open_file(path: impl AsRef) -> Result { let reopen = ReopenableFile { - path: path.to_string(), - file: File::open(path)?, + file: File::open(&path)?, + path: path.as_ref().to_owned(), }; let b = BigWigRead::open(reopen); if b.is_err() { - eprintln!("Error when opening: {}", path); + eprintln!("Error when opening: {:?}", path.as_ref()); } b } diff --git a/bigtools/src/bbi/bigwigwrite.rs b/bigtools/src/bbi/bigwigwrite.rs index 7a63842..209cf1c 100644 --- a/bigtools/src/bbi/bigwigwrite.rs +++ b/bigtools/src/bbi/bigwigwrite.rs @@ -31,16 +31,17 @@ let runtime = tokio::runtime::Builder::new_multi_thread() // Finally, we can create a `BigWigWrite` with a file to write to. We'll use a temporary file. 
let tempfile = tempfile::NamedTempFile::new()?; -let out = BigWigWrite::create_file(tempfile.path().to_string_lossy().to_string()); +let out = BigWigWrite::create_file(tempfile.path(), chrom_map).unwrap(); // Then write. -out.write(chrom_map, vals, runtime)?; +out.write(vals, runtime)?; # Ok(()) # } ``` */ use std::collections::HashMap; use std::fs::File; -use std::io::{self, BufWriter, Write}; +use std::io::{self, BufWriter, Seek, Write}; +use std::path::Path; use std::vec; use futures::sink::SinkExt; @@ -74,20 +75,32 @@ struct ZoomItem { } /// The struct used to write a bigWig file -pub struct BigWigWrite { - pub path: String, +pub struct BigWigWrite { + out: W, + chrom_sizes: HashMap, pub options: BBIWriteOptions, } -impl BigWigWrite { - pub fn create_file(path: String) -> Self { +impl BigWigWrite { + pub fn create_file( + path: impl AsRef, + chrom_sizes: HashMap, + ) -> io::Result { + let out = File::create(path)?; + Ok(BigWigWrite::new(out, chrom_sizes)) + } +} + +impl BigWigWrite { + pub fn new(out: W, chrom_sizes: HashMap) -> Self { BigWigWrite { - path, + out, + chrom_sizes, options: BBIWriteOptions::default(), } } - fn write_pre(file: &mut BufWriter) -> Result<(u64, u64, u64), ProcessDataError> { + fn write_pre(file: &mut BufWriter) -> Result<(u64, u64, u64), ProcessDataError> { write_blank_headers(file)?; let total_summary_offset = file.tell()?; @@ -110,22 +123,20 @@ impl BigWigWrite { /// Write the values from `V` as a bigWig. Will utilize the provided runtime for encoding values and for reading through the values (potentially parallelized by chromosome). 
pub fn write>( self, - chrom_sizes: HashMap, vals: V, runtime: Runtime, ) -> Result<(), BBIProcessError> { let options = self.options; - let fp = File::create(self.path.clone())?; - let mut file = BufWriter::new(fp); + let mut file = BufWriter::new(self.out); let (total_summary_offset, full_data_offset, pre_data) = BigWigWrite::write_pre(&mut file)?; - let output = bbiwrite::write_vals::<_, BigWigFullProcess>( + let output = bbiwrite::write_vals::<_, _, BigWigFullProcess>( vals, file, options, runtime, - chrom_sizes.clone(), + &self.chrom_sizes, )?; let ( @@ -142,7 +153,7 @@ impl BigWigWrite { &mut file, pre_data, raw_sections_iter, - chrom_sizes, + self.chrom_sizes, &chrom_ids, self.options, )?; @@ -176,22 +187,20 @@ impl BigWigWrite { pub fn write_multipass>( self, make_vals: impl Fn() -> Result>, - chrom_sizes: HashMap, runtime: Runtime, ) -> Result<(), BBIProcessError> { - let fp = File::create(self.path.clone())?; - let mut file = BufWriter::new(fp); + let mut file = BufWriter::new(self.out); let (total_summary_offset, full_data_offset, pre_data) = BigWigWrite::write_pre(&mut file)?; let vals = make_vals()?; - let output = bbiwrite::write_vals_no_zoom::<_, BigWigNoZoomsProcess>( + let output = bbiwrite::write_vals_no_zoom::<_, _, BigWigNoZoomsProcess>( vals, file, self.options, &runtime, - chrom_sizes.clone(), + &self.chrom_sizes, ); let (chrom_ids, summary, zoom_counts, mut file, raw_sections_iter, mut uncompress_buf_size) = output?; @@ -201,14 +210,14 @@ impl BigWigWrite { &mut file, pre_data, raw_sections_iter, - chrom_sizes, + self.chrom_sizes, &chrom_ids, self.options, )?; let vals = make_vals()?; - let output = bbiwrite::write_zoom_vals::<_, BigWigZoomsProcess>( + let output = bbiwrite::write_zoom_vals::<_, _, BigWigZoomsProcess>( vals, self.options, &runtime, @@ -241,151 +250,150 @@ impl BigWigWrite { Ok(()) } +} - async fn process_val( - current_val: Value, - next_val: Option<&Value>, - chrom_length: u32, - chrom: &String, - summary: &mut Summary, - 
items: &mut Vec, - options: BBIWriteOptions, - runtime: &Handle, - ftx: &mut BBIDataProcessoringInputSectionChannel, - chrom_id: u32, - ) -> Result<(), BigWigInvalidInput> { - // Check a few preconditions: - // - The current end is greater than or equal to the start - // - The current end is at most the chromosome length - // - If there is a next value, then it does not overlap value - // TODO: test these correctly fails - if current_val.start > current_val.end { - return Err(BigWigInvalidInput(format!( - "Invalid bed graph: {} > {}", - current_val.start, current_val.end - ))); - } - if current_val.end > chrom_length { - return Err(BigWigInvalidInput(format!( - "Invalid bed graph: `{}` is greater than the chromosome ({}) length ({})", - current_val.end, chrom, chrom_length - ))); - } - match next_val { - None => {} - Some(next_val) => { - if current_val.end > next_val.start { - return Err(BigWigInvalidInput(format!( - "Invalid bed graph: overlapping values on chromosome {} at {}-{} and {}-{}", - chrom, current_val.start, current_val.end, next_val.start, next_val.end, - ))); - } +async fn process_val( + current_val: Value, + next_val: Option<&Value>, + chrom_length: u32, + chrom: &String, + summary: &mut Summary, + items: &mut Vec, + options: BBIWriteOptions, + runtime: &Handle, + ftx: &mut BBIDataProcessoringInputSectionChannel, + chrom_id: u32, +) -> Result<(), BigWigInvalidInput> { + // Check a few preconditions: + // - The current end is greater than or equal to the start + // - The current end is at most the chromosome length + // - If there is a next value, then it does not overlap value + // TODO: test these correctly fails + if current_val.start > current_val.end { + return Err(BigWigInvalidInput(format!( + "Invalid bed graph: {} > {}", + current_val.start, current_val.end + ))); + } + if current_val.end > chrom_length { + return Err(BigWigInvalidInput(format!( + "Invalid bed graph: `{}` is greater than the chromosome ({}) length ({})", + current_val.end, 
chrom, chrom_length + ))); + } + match next_val { + None => {} + Some(next_val) => { + if current_val.end > next_val.start { + return Err(BigWigInvalidInput(format!( + "Invalid bed graph: overlapping values on chromosome {} at {}-{} and {}-{}", + chrom, current_val.start, current_val.end, next_val.start, next_val.end, + ))); } } + } - // Now, actually process the value. - - // First, update the summary. - let len = current_val.end - current_val.start; - let val = f64::from(current_val.value); - summary.total_items += 1; - summary.bases_covered += u64::from(len); - summary.min_val = summary.min_val.min(val); - summary.max_val = summary.max_val.max(val); - summary.sum += f64::from(len) * val; - summary.sum_squares += f64::from(len) * val * val; - - // Then, add the current item to the actual values, and encode if full, or last item - items.push(current_val); - if next_val.is_none() || items.len() >= options.items_per_slot as usize { - let items = - std::mem::replace(items, Vec::with_capacity(options.items_per_slot as usize)); - let handle: tokio::task::JoinHandle> = - runtime.spawn(encode_section(options.compress, items, chrom_id)); - ftx.send(handle).await.expect("Couldn't send"); - } - - Ok(()) + // Now, actually process the value. + + // First, update the summary. 
+ let len = current_val.end - current_val.start; + let val = f64::from(current_val.value); + summary.total_items += 1; + summary.bases_covered += u64::from(len); + summary.min_val = summary.min_val.min(val); + summary.max_val = summary.max_val.max(val); + summary.sum += f64::from(len) * val; + summary.sum_squares += f64::from(len) * val * val; + + // Then, add the current item to the actual values, and encode if full, or last item + items.push(current_val); + if next_val.is_none() || items.len() >= options.items_per_slot as usize { + let items = std::mem::replace(items, Vec::with_capacity(options.items_per_slot as usize)); + let handle: tokio::task::JoinHandle> = + runtime.spawn(encode_section(options.compress, items, chrom_id)); + ftx.send(handle).await.expect("Couldn't send"); } - async fn process_val_zoom( - zoom_items: &mut Vec, - options: BBIWriteOptions, - current_val: Value, - next_val: Option<&Value>, - runtime: &Handle, - chrom_id: u32, - ) { - // Then, add the item to the zoom item queues. This is a bit complicated. - for zoom_item in zoom_items.iter_mut() { - debug_assert_ne!(zoom_item.records.len(), options.items_per_slot as usize); - - // Zooms are comprised of a tiled set of summaries. Each summary spans a fixed length. - // Zoom summaries are compressed similarly to main data, with a given items per slot. - // It may be the case that our value spans across multiple zoom summaries, so this inner loop handles that. 
- - // `add_start` indicates where we are *currently* adding bases from (either the start of this item or in the middle, but beginning of another zoom section) - let mut add_start = current_val.start; - loop { - // Write section if full; or if no next section, some items, and no current zoom record - if (add_start >= current_val.end - && zoom_item.live_info.is_none() - && next_val.is_none() - && !zoom_item.records.is_empty()) - || zoom_item.records.len() == options.items_per_slot as usize - { - let items = std::mem::take(&mut zoom_item.records); - let handle = runtime.spawn(encode_zoom_section(options.compress, items)); - zoom_item.channel.send(handle).await.expect("Couln't send"); - } - if add_start >= current_val.end { - if next_val.is_none() { - if let Some(zoom2) = zoom_item.live_info.take() { - zoom_item.records.push(zoom2); - continue; - } + Ok(()) +} + +async fn process_val_zoom( + zoom_items: &mut Vec, + options: BBIWriteOptions, + current_val: Value, + next_val: Option<&Value>, + runtime: &Handle, + chrom_id: u32, +) { + // Then, add the item to the zoom item queues. This is a bit complicated. + for zoom_item in zoom_items.iter_mut() { + debug_assert_ne!(zoom_item.records.len(), options.items_per_slot as usize); + + // Zooms are comprised of a tiled set of summaries. Each summary spans a fixed length. + // Zoom summaries are compressed similarly to main data, with a given items per slot. + // It may be the case that our value spans across multiple zoom summaries, so this inner loop handles that. 
+ + // `add_start` indicates where we are *currently* adding bases from (either the start of this item or in the middle, but beginning of another zoom section) + let mut add_start = current_val.start; + loop { + // Write section if full; or if no next section, some items, and no current zoom record + if (add_start >= current_val.end + && zoom_item.live_info.is_none() + && next_val.is_none() + && !zoom_item.records.is_empty()) + || zoom_item.records.len() == options.items_per_slot as usize + { + let items = std::mem::take(&mut zoom_item.records); + let handle = runtime.spawn(encode_zoom_section(options.compress, items)); + zoom_item.channel.send(handle).await.expect("Couln't send"); + } + if add_start >= current_val.end { + if next_val.is_none() { + if let Some(zoom2) = zoom_item.live_info.take() { + zoom_item.records.push(zoom2); + continue; } - break; } - let val = f64::from(current_val.value); - let zoom2 = zoom_item.live_info.get_or_insert(ZoomRecord { - chrom: chrom_id, - start: add_start, - end: add_start, - summary: Summary { - total_items: 0, - bases_covered: 0, - min_val: val, - max_val: val, - sum: 0.0, - sum_squares: 0.0, - }, - }); - // The end of zoom record - let next_end = zoom2.start + zoom_item.size; - // End of bases that we could add - let add_end = std::cmp::min(next_end, current_val.end); - // If the last zoom ends before this value starts, we don't add anything - if add_end >= add_start { - let added_bases = add_end - add_start; - zoom2.end = add_end; - zoom2.summary.total_items += 1; - zoom2.summary.bases_covered += u64::from(added_bases); - zoom2.summary.min_val = zoom2.summary.min_val.min(val); - zoom2.summary.max_val = zoom2.summary.max_val.max(val); - zoom2.summary.sum += f64::from(added_bases) * val; - zoom2.summary.sum_squares += f64::from(added_bases) * val * val; - } - // If we made it to the end of the zoom (whether it was because the zoom ended before this value started, - // or we added to the end of the zoom), then write this zooms 
to the current section - if add_end == next_end { - zoom_item.records.push(zoom_item.live_info.take().unwrap()); - } - // Set where we would start for next time - add_start = add_end; + break; + } + let val = f64::from(current_val.value); + let zoom2 = zoom_item.live_info.get_or_insert(ZoomRecord { + chrom: chrom_id, + start: add_start, + end: add_start, + summary: Summary { + total_items: 0, + bases_covered: 0, + min_val: val, + max_val: val, + sum: 0.0, + sum_squares: 0.0, + }, + }); + // The end of zoom record + let next_end = zoom2.start + zoom_item.size; + // End of bases that we could add + let add_end = std::cmp::min(next_end, current_val.end); + // If the last zoom ends before this value starts, we don't add anything + if add_end >= add_start { + let added_bases = add_end - add_start; + zoom2.end = add_end; + zoom2.summary.total_items += 1; + zoom2.summary.bases_covered += u64::from(added_bases); + zoom2.summary.min_val = zoom2.summary.min_val.min(val); + zoom2.summary.max_val = zoom2.summary.max_val.max(val); + zoom2.summary.sum += f64::from(added_bases) * val; + zoom2.summary.sum_squares += f64::from(added_bases) * val * val; + } + // If we made it to the end of the zoom (whether it was because the zoom ended before this value started, + // or we added to the end of the zoom), then write this zooms to the current section + if add_end == next_end { + zoom_item.records.push(zoom_item.live_info.take().unwrap()); } - debug_assert_ne!(zoom_item.records.len(), options.items_per_slot as usize); + // Set where we would start for next time + add_start = add_end; } + debug_assert_ne!(zoom_item.records.len(), options.items_per_slot as usize); } } @@ -493,7 +501,7 @@ impl BBIDataProcessor for BigWigFullProcess { let options = *options; let length = *length; - BigWigWrite::process_val( + process_val( current_val, next_val, length, @@ -507,7 +515,7 @@ impl BBIDataProcessor for BigWigFullProcess { ) .await?; - BigWigWrite::process_val_zoom( + process_val_zoom( 
zoom_items, options, current_val, @@ -621,7 +629,7 @@ impl BBIDataProcessor for BigWigNoZoomsProcess { zoom_counts, } = self; - BigWigWrite::process_val( + process_val( current_val, next_val, *length, @@ -650,8 +658,8 @@ impl BBIDataProcessor for BigWigNoZoomsProcess { } } -struct BigWigZoomsProcess { - temp_zoom_items: Vec, +struct BigWigZoomsProcess { + temp_zoom_items: Vec>, chrom_id: u32, options: BBIWriteOptions, runtime: Handle, @@ -659,9 +667,9 @@ struct BigWigZoomsProcess { zoom_items: Vec, } -impl BBIDataProcessorCreate for BigWigZoomsProcess { - type I = ZoomsInternalProcessData; - type Out = ZoomsInternalProcessedData; +impl BBIDataProcessorCreate for BigWigZoomsProcess { + type I = ZoomsInternalProcessData; + type Out = ZoomsInternalProcessedData; fn create(internal_data: Self::I) -> Self { let ZoomsInternalProcessData(temp_zoom_items, zooms_channels, chrom_id, options, runtime) = internal_data; @@ -695,7 +703,7 @@ impl BBIDataProcessorCreate for BigWigZoomsProcess { ZoomsInternalProcessedData(self.temp_zoom_items) } } -impl BBIDataProcessor for BigWigZoomsProcess { +impl BBIDataProcessor for BigWigZoomsProcess { type Value = Value; async fn do_process( &mut self, @@ -710,7 +718,7 @@ impl BBIDataProcessor for BigWigZoomsProcess { .. 
} = self; - BigWigWrite::process_val_zoom( + process_val_zoom( zoom_items, *options, current_val, diff --git a/bigtools/src/utils/cli/bedgraphtobigwig.rs b/bigtools/src/utils/cli/bedgraphtobigwig.rs index eb7d54b..01098f5 100644 --- a/bigtools/src/utils/cli/bedgraphtobigwig.rs +++ b/bigtools/src/utils/cli/bedgraphtobigwig.rs @@ -64,12 +64,6 @@ pub fn bedgraphtobigwig(args: BedGraphToBigWigArgs) -> Result<(), Box } }; - let mut outb = BigWigWrite::create_file(bigwigpath); - outb.options.max_zooms = args.write_args.nzooms; - outb.options.compress = !args.write_args.uncompressed; - outb.options.input_sort_type = input_sort_type; - outb.options.block_size = args.write_args.block_size; - outb.options.inmemory = args.write_args.inmemory; let chrom_map: HashMap = BufReader::new(File::open(chrom_map)?) .lines() .filter(|l| match l { @@ -86,6 +80,13 @@ pub fn bedgraphtobigwig(args: BedGraphToBigWigArgs) -> Result<(), Box }) .collect(); + let mut outb = BigWigWrite::create_file(bigwigpath, chrom_map)?; + outb.options.max_zooms = args.write_args.nzooms; + outb.options.compress = !args.write_args.uncompressed; + outb.options.input_sort_type = input_sort_type; + outb.options.block_size = args.write_args.block_size; + outb.options.inmemory = args.write_args.inmemory; + let runtime = if nthreads == 1 { outb.options.channel_size = 0; runtime::Builder::new_current_thread().build().unwrap() @@ -100,7 +101,7 @@ pub fn bedgraphtobigwig(args: BedGraphToBigWigArgs) -> Result<(), Box if bedgraphpath == "-" || bedgraphpath == "stdin" { let stdin = std::io::stdin().lock(); let vals = BedParserStreamingIterator::from_bedgraph_file(stdin, allow_out_of_order_chroms); - outb.write(chrom_map, vals, runtime)?; + outb.write(vals, runtime)?; } else { let infile = File::open(&bedgraphpath)?; let (parallel, parallel_required) = match (nthreads, args.parallel.as_ref()) { @@ -139,7 +140,7 @@ pub fn bedgraphtobigwig(args: BedGraphToBigWigArgs) -> Result<(), Box PathBuf::from(bedgraphpath), 
parse_bedgraph, ); - outb.write(chrom_map, data, runtime)?; + outb.write(data, runtime)?; } else { outb.write_multipass( || { @@ -152,7 +153,6 @@ pub fn bedgraphtobigwig(args: BedGraphToBigWigArgs) -> Result<(), Box Ok(data) }, - chrom_map, runtime, )?; } @@ -163,7 +163,7 @@ pub fn bedgraphtobigwig(args: BedGraphToBigWigArgs) -> Result<(), Box infile, allow_out_of_order_chroms, ); - outb.write(chrom_map, vals, runtime)?; + outb.write(vals, runtime)?; } else { outb.write_multipass( || { @@ -173,7 +173,6 @@ pub fn bedgraphtobigwig(args: BedGraphToBigWigArgs) -> Result<(), Box allow_out_of_order_chroms, )) }, - chrom_map, runtime, )?; } diff --git a/bigtools/src/utils/cli/bedtobigbed.rs b/bigtools/src/utils/cli/bedtobigbed.rs index 45723aa..e0a0b3c 100644 --- a/bigtools/src/utils/cli/bedtobigbed.rs +++ b/bigtools/src/utils/cli/bedtobigbed.rs @@ -72,11 +72,6 @@ pub fn bedtobigbed(args: BedToBigBedArgs) -> Result<(), Box> { } }; - let mut outb = BigBedWrite::create_file(bigwigpath); - outb.options.max_zooms = args.write_args.nzooms; - outb.options.compress = !args.write_args.uncompressed; - outb.options.input_sort_type = input_sort_type; - outb.options.inmemory = args.write_args.inmemory; let chrom_map: HashMap = BufReader::new(File::open(chrom_map)?) 
.lines() .filter(|l| match l { @@ -93,6 +88,11 @@ pub fn bedtobigbed(args: BedToBigBedArgs) -> Result<(), Box> { }) .collect(); + let mut outb = BigBedWrite::create_file(bigwigpath, chrom_map)?; + outb.options.max_zooms = args.write_args.nzooms; + outb.options.compress = !args.write_args.uncompressed; + outb.options.input_sort_type = input_sort_type; + outb.options.inmemory = args.write_args.inmemory; let runtime = if nthreads == 1 { outb.options.channel_size = 0; runtime::Builder::new_current_thread().build().unwrap() @@ -117,7 +117,7 @@ pub fn bedtobigbed(args: BedToBigBedArgs) -> Result<(), Box> { if bedpath == "-" || bedpath == "stdin" { let stdin = std::io::stdin().lock(); let data = BedParserStreamingIterator::from_bed_file(stdin, allow_out_of_order_chroms); - outb.write(chrom_map, data, runtime)?; + outb.write(data, runtime)?; } else { let infile = File::open(&bedpath)?; let (parallel, parallel_required) = match (nthreads, args.parallel.as_ref()) { @@ -156,7 +156,7 @@ pub fn bedtobigbed(args: BedToBigBedArgs) -> Result<(), Box> { PathBuf::from(bedpath), parse_bed, ); - outb.write(chrom_map, data, runtime)?; + outb.write(data, runtime)?; } else { outb.write_multipass( || { @@ -169,7 +169,6 @@ pub fn bedtobigbed(args: BedToBigBedArgs) -> Result<(), Box> { Ok(data) }, - chrom_map, runtime, )?; } @@ -178,7 +177,7 @@ pub fn bedtobigbed(args: BedToBigBedArgs) -> Result<(), Box> { if args.single_pass { let data = BedParserStreamingIterator::from_bed_file(infile, allow_out_of_order_chroms); - outb.write(chrom_map, data, runtime)?; + outb.write(data, runtime)?; } else { outb.write_multipass( || { @@ -190,7 +189,6 @@ pub fn bedtobigbed(args: BedToBigBedArgs) -> Result<(), Box> { Ok(data) }, - chrom_map, runtime, )?; } diff --git a/bigtools/src/utils/cli/bigwigaverageoverbed.rs b/bigtools/src/utils/cli/bigwigaverageoverbed.rs index 9753df5..cd17680 100644 --- a/bigtools/src/utils/cli/bigwigaverageoverbed.rs +++ b/bigtools/src/utils/cli/bigwigaverageoverbed.rs @@ -69,8 
+69,8 @@ pub fn bigwigaverageoverbed( let add_min_max = args.min_max; let reopen = ReopenableFile { - path: bigwigpath.to_string(), - file: File::open(bigwigpath)?, + file: File::open(&bigwigpath)?, + path: bigwigpath.into(), }; let mut inbigwig = BigWigRead::open(reopen)?.cached(); diff --git a/bigtools/src/utils/cli/bigwigmerge.rs b/bigtools/src/utils/cli/bigwigmerge.rs index 1ee5871..da446a4 100644 --- a/bigtools/src/utils/cli/bigwigmerge.rs +++ b/bigtools/src/utils/cli/bigwigmerge.rs @@ -122,7 +122,7 @@ pub fn bigwigmerge(args: BigWigMergeArgs) -> Result<(), Box> { }; match output_type { OutputType::BigWig => { - let outb = BigWigWrite::create_file(output); + let outb = BigWigWrite::create_file(output, chrom_map)?; let runtime = if nthreads == 1 { runtime::Builder::new_current_thread().build().unwrap() } else { @@ -134,7 +134,7 @@ pub fn bigwigmerge(args: BigWigMergeArgs) -> Result<(), Box> { let all_values = ChromGroupReadImpl { iter: Box::new(iter), }; - outb.write(chrom_map, all_values, runtime)?; + outb.write(all_values, runtime)?; } OutputType::BedGraph => { // TODO: convert to multi-threaded @@ -263,7 +263,7 @@ pub fn get_merged_vals( } } // We don't want to a new file descriptor for every chrom - bws.push((w.info().clone(), w.inner_read().path.to_string())); + bws.push((w.info().clone(), w.inner_read().path.clone())); } let size = size.unwrap(); diff --git a/bigtools/src/utils/file/reopen.rs b/bigtools/src/utils/file/reopen.rs index 4ab4b4a..1e37a8b 100644 --- a/bigtools/src/utils/file/reopen.rs +++ b/bigtools/src/utils/file/reopen.rs @@ -1,5 +1,6 @@ use std::fs::File; use std::io::{self, Read, Seek}; +use std::path::PathBuf; /// A helper trait that for things that implement `Read`, `Seek`, and `Send` pub trait SeekableRead: Seek + Read {} @@ -12,7 +13,7 @@ pub trait Reopen: Sized { } pub struct ReopenableFile { - pub path: String, + pub path: PathBuf, pub file: File, } diff --git a/bigtools/tests/bigbedwrite.rs b/bigtools/tests/bigbedwrite.rs index 
7c3f18f..ba3bd4a 100644 --- a/bigtools/tests/bigbedwrite.rs +++ b/bigtools/tests/bigbedwrite.rs @@ -32,7 +32,13 @@ fn bigbedwrite_test() -> Result<(), Box> { .expect("Unable to create runtime."); let tempfile = tempfile::NamedTempFile::new()?; - let mut outb = BigBedWrite::create_file(tempfile.path().to_string_lossy().to_string()); + + let mut chrom_map = HashMap::new(); + chrom_map.insert("chr17".to_string(), 83257441); + chrom_map.insert("chr18".to_string(), 80373285); + chrom_map.insert("chr19".to_string(), 58617616); + + let mut outb = BigBedWrite::create_file(tempfile.path(), chrom_map).unwrap(); outb.autosql = { let infile = File::open(&bed)?; let mut vals_iter = BedFileStream::from_bed_file(infile); @@ -42,16 +48,11 @@ fn bigbedwrite_test() -> Result<(), Box> { }; outb.options.compress = false; - let mut chrom_map = HashMap::new(); - chrom_map.insert("chr17".to_string(), 83257441); - chrom_map.insert("chr18".to_string(), 80373285); - chrom_map.insert("chr19".to_string(), 58617616); - let infile = File::open(bed)?; let data = BedParserStreamingIterator::from_bed_file(infile, false); - outb.write(chrom_map, data, runtime).unwrap(); + outb.write(data, runtime).unwrap(); - let mut bwread = BigBedRead::open_file(&tempfile.path().to_string_lossy()).unwrap(); + let mut bwread = BigBedRead::open_file(tempfile.path()).unwrap(); let chroms = bwread.chroms(); assert_eq!(chroms.len(), 3); diff --git a/bigtools/tests/bigwigread.rs b/bigtools/tests/bigwigread.rs index d39e390..77ae7aa 100644 --- a/bigtools/tests/bigwigread.rs +++ b/bigtools/tests/bigwigread.rs @@ -12,7 +12,7 @@ fn test_valid_read() -> Result<(), Box> { let mut valid_bigwig = dir.clone(); valid_bigwig.push("valid.bigWig"); - let mut bwread = BigWigRead::open_file(&valid_bigwig.to_string_lossy()).unwrap(); + let mut bwread = BigWigRead::open_file(valid_bigwig).unwrap(); // Test that chrom tree parsing works let chroms = bwread.chroms(); @@ -50,7 +50,7 @@ fn test_values() -> Result<(), Box> { let mut 
valid_bigwig = dir.clone(); valid_bigwig.push("valid.bigWig"); - let mut bwread = BigWigRead::open_file(&valid_bigwig.to_string_lossy()).unwrap(); + let mut bwread = BigWigRead::open_file(valid_bigwig).unwrap(); let vals = bwread.values("chr17", 0, 59899)?; assert_eq!(vals.len(), 59899); @@ -70,7 +70,7 @@ fn test_reduction_values() -> Result<(), Box> { let mut valid_bigwig = dir.clone(); valid_bigwig.push("valid.bigWig"); - let mut bwread = BigWigRead::open_file(&valid_bigwig.to_string_lossy()).unwrap(); + let mut bwread = BigWigRead::open_file(valid_bigwig).unwrap(); let interval = bwread.get_zoom_interval("chr17", 0, 36996442, 10240); let x: Vec<_> = interval.unwrap().collect(); diff --git a/bigtools/tests/bigwigwrite.rs b/bigtools/tests/bigwigwrite.rs index 756cbe4..6ab652d 100644 --- a/bigtools/tests/bigwigwrite.rs +++ b/bigtools/tests/bigwigwrite.rs @@ -32,15 +32,16 @@ fn test() -> Result<(), Box> { let infile = File::open(single_chrom_bedgraph)?; let tempfile = tempfile::NamedTempFile::new()?; - let outb = BigWigWrite::create_file(tempfile.path().to_string_lossy().to_string()); let mut chrom_map = HashMap::new(); chrom_map.insert("chr17".to_string(), 83257441); + let outb = BigWigWrite::create_file(tempfile.path(), chrom_map)?; + let data = BedParserStreamingIterator::from_bedgraph_file(infile, false); - outb.write(chrom_map, data, runtime).unwrap(); + outb.write(data, runtime).unwrap(); - let mut bwread = BigWigRead::open_file(&tempfile.path().to_string_lossy()).unwrap(); + let mut bwread = BigWigRead::open_file(tempfile.path()).unwrap(); let chroms = bwread.chroms(); assert_eq!(chroms.len(), 1); @@ -77,23 +78,23 @@ fn test_multi_pass() -> Result<(), Box> { let tempfile = tempfile::NamedTempFile::new()?; - let outb = BigWigWrite::create_file(tempfile.path().to_string_lossy().to_string()); - let mut chrom_map = HashMap::new(); chrom_map.insert("chr17".to_string(), 83257441); + let outb = + 
BigWigWrite::create_file(tempfile.path().to_string_lossy().to_string(), chrom_map).unwrap(); + outb.write_multipass( || { let infile = File::open(single_chrom_bedgraph.clone())?; let data = BedParserStreamingIterator::from_bedgraph_file(infile, false); Ok(data) }, - chrom_map, runtime, ) .unwrap(); - let mut bwread = BigWigRead::open_file(&tempfile.path().to_string_lossy()).unwrap(); + let mut bwread = BigWigRead::open_file(tempfile.path()).unwrap(); let chroms = bwread.chroms(); assert_eq!(chroms.len(), 1); @@ -124,7 +125,6 @@ fn test_multi_chrom() -> io::Result<()> { let infile = File::open(multi_chrom_bedgraph)?; let tempfile = tempfile::NamedTempFile::new()?; - let outb = BigWigWrite::create_file(tempfile.path().to_string_lossy().to_string()); let mut chrom_map = HashMap::new(); chrom_map.insert("chr1".to_string(), 248956422); @@ -134,10 +134,12 @@ fn test_multi_chrom() -> io::Result<()> { chrom_map.insert("chr5".to_string(), 181538259); chrom_map.insert("chr6".to_string(), 170805979); + let outb = BigWigWrite::create_file(tempfile.path().to_string_lossy().to_string(), chrom_map)?; + let data = BedParserStreamingIterator::from_bedgraph_file(infile, false); - outb.write(chrom_map, data, runtime).unwrap(); + outb.write(data, runtime).unwrap(); - let mut bwread = BigWigRead::open_file(&tempfile.path().to_string_lossy()).unwrap(); + let mut bwread = BigWigRead::open_file(tempfile.path()).unwrap(); let chroms = bwread.chroms(); assert_eq!(chroms.len(), 6); @@ -185,6 +187,7 @@ fn test_iter() { .expect("Unable to create runtime."); let tempfile = tempfile::NamedTempFile::new().unwrap(); - let outb = BigWigWrite::create_file(tempfile.path().to_string_lossy().to_string()); - outb.write(chrom_map, vals_iter, runtime).unwrap(); + let outb = + BigWigWrite::create_file(tempfile.path().to_string_lossy().to_string(), chrom_map).unwrap(); + outb.write(vals_iter, runtime).unwrap(); } diff --git a/pybigtools/src/lib.rs b/pybigtools/src/lib.rs index daba1c3..90ce53d 100644 --- 
a/pybigtools/src/lib.rs +++ b/pybigtools/src/lib.rs @@ -1898,7 +1898,7 @@ impl BigBedEntriesIterator { /// Interface for writing to a BigWig file. #[pyclass(module = "pybigtools")] struct BigWigWrite { - bigwig: Option, + bigwig: Option, } #[pymethods] @@ -1931,10 +1931,6 @@ impl BigWigWrite { .build() .expect("Unable to create thread pool."); - let bigwig = self - .bigwig - .take() - .ok_or_else(|| PyErr::new::("File already closed."))?; let chrom_map = chroms .into_iter() .map(|(key, val)| { @@ -1943,6 +1939,18 @@ impl BigWigWrite { Ok((chrom, length)) }) .collect::, pyo3::PyDowncastError>>()?; + + let bigwig = self + .bigwig + .take() + .ok_or_else(|| PyErr::new::("Can only write once."))?; + let bigwig = BigWigWriteRaw::create_file(bigwig, chrom_map).map_err(|e| { + PyErr::new::(format!( + "Error occured when creating file: {}", + e + )) + })?; + struct IterError(String); struct Iter { inner: PyObject, @@ -2027,7 +2035,7 @@ impl BigWigWrite { Ok(v) => Ok(v), }); let data = BedParserStreamingIterator::wrap_iter(vals_iter_raw, true); - match bigwig.write(chrom_map, data, runtime) { + match bigwig.write(data, runtime) { Err(e) => println!("{}", e), Ok(_) => {} } @@ -2048,7 +2056,7 @@ impl BigWigWrite { /// Interface for writing to a BigBed file. 
#[pyclass(module = "pybigtools")] struct BigBedWrite { - bigbed: Option, + bigbed: Option, } #[pymethods] @@ -2082,10 +2090,6 @@ impl BigBedWrite { .build() .expect("Unable to create thread pool."); - let bigbed = self - .bigbed - .take() - .ok_or_else(|| PyErr::new::("File already closed."))?; let chrom_map = chroms .into_iter() .map(|(key, val)| { @@ -2094,6 +2098,18 @@ impl BigBedWrite { Ok((chrom, length)) }) .collect::, pyo3::PyDowncastError>>()?; + + let bigbed = self + .bigbed + .take() + .ok_or_else(|| PyErr::new::("File already closed."))?; + let bigbed = BigBedWriteRaw::create_file(bigbed, chrom_map).map_err(|e| { + PyErr::new::(format!( + "Error occured when creating file: {}", + e + )) + })?; + struct IterError(String); struct Iter { inner: PyObject, @@ -2179,7 +2195,7 @@ impl BigBedWrite { Ok(v) => Ok(v), }); let data = BedParserStreamingIterator::wrap_iter(vals_iter_raw, true); - match bigbed.write(chrom_map, data, runtime) { + match bigbed.write(data, runtime) { Err(e) => { println!("{}", e) } @@ -2404,11 +2420,11 @@ fn open_path_or_url( } match extension.as_ref() { "bw" | "bigWig" | "bigwig" => BigWigWrite { - bigwig: Some(BigWigWriteRaw::create_file(path_url_or_file_like)), + bigwig: Some(path_url_or_file_like), } .into_py(py), "bb" | "bigBed" | "bigbed" => BigBedWrite { - bigbed: Some(BigBedWriteRaw::create_file(path_url_or_file_like)), + bigbed: Some(path_url_or_file_like), } .into_py(py), _ => {