From 58a11a128d29db0874d37b928c2ff6077f430eac Mon Sep 17 00:00:00 2001 From: jdidion Date: Tue, 7 Nov 2023 10:33:49 -0800 Subject: [PATCH 1/6] WIP --- src/io/mod.rs | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/src/io/mod.rs b/src/io/mod.rs index 1d7c2b6..a75f184 100644 --- a/src/io/mod.rs +++ b/src/io/mod.rs @@ -79,7 +79,7 @@ impl Io { } /// Returns true if the path ends with a recognized GZIP file extension - fn is_gzip_path>(p: &P) -> bool { + fn is_gzip_path>(p: P) -> bool { if let Some(ext) = p.as_ref().extension() { match ext.to_str() { Some(x) => GZIP_EXTENSIONS.contains(&x), @@ -92,7 +92,7 @@ impl Io { /// Opens a file for reading. Transparently handles reading gzipped files based /// extension. - pub fn new_reader

(&self, p: &P) -> Result> + pub fn new_reader

(&self, p: P) -> Result> where P: AsRef, { @@ -108,7 +108,7 @@ impl Io { /// Opens a file for writing. Transparently handles writing GZIP'd data if the file /// ends with a recognized GZIP extension. - pub fn new_writer

(&self, p: &P) -> Result>> + pub fn new_writer

(&self, p: P) -> Result>> where P: AsRef, { @@ -167,6 +167,30 @@ impl Default for DelimFile { } impl DelimFile { + /// Returns a new `DelimFileReader` instance that reads from the given path, opened with this + /// `DelimFile`'s `Io` instance. + pub fn new_reader>( + &self, + path: P, + delimiter: u8, + quote: bool, + ) -> Result> { + let file = self.io.new_reader(path)?; + Ok(DelimFileReader::new(file, delimiter, quote)) + } + + /// Returns a new `DelimFileWriter` instance that writes to the given path, opened with this + /// `DelimFile`'s `Io` instance. + pub fn new_writer>( + &self, + path: P, + delimiter: u8, + quote: bool, + ) -> Result> { + let file = self.io.new_writer(path)?; + Ok(DelimFileWriter::new(file, delimiter, quote)) + } + /// Writes a series of one or more structs to a delimited file. If `quote` is true then fields /// will be quoted as necessary, otherwise they will never be quoted. pub fn write( From b823ae1bf512956126e1d710d70c79a7b24fcc81 Mon Sep 17 00:00:00 2001 From: jdidion Date: Tue, 7 Nov 2023 13:06:44 -0800 Subject: [PATCH 2/6] implement DelimFileReader, DelimFileWriter; rewrite DelimFile using new reader/writer structs --- src/io/mod.rs | 100 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 67 insertions(+), 33 deletions(-) diff --git a/src/io/mod.rs b/src/io/mod.rs index a75f184..5acaac8 100644 --- a/src/io/mod.rs +++ b/src/io/mod.rs @@ -43,10 +43,11 @@ //! ``` use std::fs::File; use std::io::{BufRead, BufReader, BufWriter, Write}; +use std::marker::PhantomData; use std::path::Path; use crate::{FgError, Result}; -use csv::{QuoteStyle, ReaderBuilder, WriterBuilder}; +use csv::{QuoteStyle, ReaderBuilder, DeserializeRecordsIntoIter, WriterBuilder, Writer}; use flate2::bufread::MultiGzDecoder; use flate2::write::GzEncoder; use flate2::Compression; @@ -96,7 +97,7 @@ impl Io { where P: AsRef, { - let file = File::open(p).map_err(FgError::IoError)?; + let file = File::open(p.as_ref()).map_err(FgError::IoError)?; let buf = BufReader::with_capacity(self.buffer_size, file); if Self::is_gzip_path(p) { @@ -112,7 +113,7 @@ impl Io { where P: AsRef, { - let file = File::create(p).map_err(FgError::IoError)?; + let file = File::create(p.as_ref()).map_err(FgError::IoError)?; let write: Box = if Io::is_gzip_path(p) { Box::new(GzEncoder::new(file, self.compression)) } else { @@ -152,6 +153,66 @@ impl Io { } } +pub struct DelimFileReader { + record_iter: DeserializeRecordsIntoIter, D>, +} + +impl DelimFileReader { + pub fn new(reader: Box, delimiter: u8, quote: bool) -> Self { + let csv_reader = ReaderBuilder::new() + .delimiter(delimiter) + .has_headers(true) + .quoting(quote) + .from_reader(reader); + let record_iter = csv_reader.into_deserialize(); + Self { record_iter } + } + + pub fn read(&mut self) -> Option> { + self.record_iter.next().map(|result| result.map_err(FgError::ConversionError)) + } +} + +impl Iterator for DelimFileReader { + type Item = Result; + + fn next(&mut self) -> Option { + self.read() + } +} + +pub struct DelimFileWriter { + csv_writer: Writer>>, + _data: PhantomData, +} + +impl DelimFileWriter { + pub fn new(writer: BufWriter>, delimiter: u8, quote: bool) -> Self { + let csv_writer = WriterBuilder::new() + .delimiter(delimiter) + .has_headers(true) + .quote_style(if quote { QuoteStyle::Necessary } else { QuoteStyle::Never }) + .from_writer(writer); + Self { csv_writer, _data: PhantomData } + } + + pub fn write(&mut self, rec: &S) -> Result<()> { + self.csv_writer.serialize(rec).map_err(FgError::ConversionError) + } + + pub fn write_all(&mut self, iter: impl IntoIterator) -> Result<()> { + for rec in iter { + self.write(&rec)?; + } + self.flush()?; + Ok(()) + } + + pub fn flush(&mut self) -> Result<()> { + self.csv_writer.flush().map_err(FgError::IoError) + } +} + /// Unit-struct that contains associated functions for reading and writing Structs to/from /// delimited files. Structs should use serde's Serialize/Deserialize derive macros in /// order to be used with these functions. @@ -169,7 +230,7 @@ impl Default for DelimFile { impl DelimFile { /// Returns a new `DelimFileReader` instance that reads from the given path, opened with this /// `DelimFile`'s `Io` instance. - pub fn new_reader>( + pub fn new_reader>( &self, path: P, delimiter: u8, @@ -204,19 +265,7 @@ impl DelimFile { S: Serialize, P: AsRef, { - let write = self.io.new_writer(path)?; - - let mut writer = WriterBuilder::new() - .delimiter(delimiter) - .has_headers(true) - .quote_style(if quote { QuoteStyle::Necessary } else { QuoteStyle::Never }) - .from_writer(write); - - for rec in recs { - writer.serialize(rec).map_err(FgError::ConversionError)?; - } - - writer.flush().map_err(FgError::IoError) + self.new_writer(path, delimiter, quote)?.write_all(recs) } /// Writes structs implementing `[Serialize]` to a file with tab separators between fields. @@ -245,22 +294,7 @@ impl DelimFile { D: DeserializeOwned, P: AsRef, { - let read = self.io.new_reader(path)?; - - let mut reader = ReaderBuilder::new() - .delimiter(delimiter) - .has_headers(true) - .quoting(quote) - .from_reader(read); - - let mut results = vec![]; - - for result in reader.deserialize::() { - let rec = result.map_err(FgError::ConversionError)?; - results.push(rec); - } - - Ok(results) + self.new_reader(path, delimiter, quote)?.collect() } /// Reads structs implementing `[Deserialize]` from a file with tab separators between fields. From cc9e7888b3ff0e3c58959632232bd67afd588e06 Mon Sep 17 00:00:00 2001 From: jdidion Date: Tue, 7 Nov 2023 13:12:36 -0800 Subject: [PATCH 3/6] formatting --- src/io/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/io/mod.rs b/src/io/mod.rs index 5acaac8..a37c261 100644 --- a/src/io/mod.rs +++ b/src/io/mod.rs @@ -47,7 +47,7 @@ use std::marker::PhantomData; use std::path::Path; use crate::{FgError, Result}; -use csv::{QuoteStyle, ReaderBuilder, DeserializeRecordsIntoIter, WriterBuilder, Writer}; +use csv::{DeserializeRecordsIntoIter, QuoteStyle, ReaderBuilder, Writer, WriterBuilder}; use flate2::bufread::MultiGzDecoder; use flate2::write::GzEncoder; use flate2::Compression; From 59477d75090290d3d4757c77167ff8630bbed19c Mon Sep 17 00:00:00 2001 From: jdidion Date: Tue, 7 Nov 2023 13:19:44 -0800 Subject: [PATCH 4/6] add comments --- src/io/mod.rs | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/io/mod.rs b/src/io/mod.rs index a37c261..e8712de 100644 --- a/src/io/mod.rs +++ b/src/io/mod.rs @@ -153,11 +153,15 @@ impl Io { } } +/// A struct that wraps a csv `Reader` and provides methods for reading one record at a time. +/// It also implements `Iterator`. pub struct DelimFileReader { record_iter: DeserializeRecordsIntoIter, D>, } impl DelimFileReader { + /// Returns a new `DelimFileReader` that will read records from the given reader with the given + /// delimiter and quoting. Assumes the input file has a header row. pub fn new(reader: Box, delimiter: u8, quote: bool) -> Self { let csv_reader = ReaderBuilder::new() .delimiter(delimiter) @@ -168,6 +172,7 @@ impl DelimFileReader { Self { record_iter } } + /// Returns the next record from the underlying reader. pub fn read(&mut self) -> Option> { self.record_iter.next().map(|result| result.map_err(FgError::ConversionError)) } @@ -181,12 +186,16 @@ impl Iterator for DelimFileReader { } } +/// A struct that wraps a csv `Writer` and provides methods for writing single records as well as +/// multiple records from an iterator. pub struct DelimFileWriter { csv_writer: Writer>>, _data: PhantomData, } impl DelimFileWriter { + /// Returns a new `DelimFileWriter` that writes to the given `writer` with the given delimiter + /// and quoting. The output file will have a header row. pub fn new(writer: BufWriter>, delimiter: u8, quote: bool) -> Self { let csv_writer = WriterBuilder::new() .delimiter(delimiter) @@ -196,10 +205,12 @@ impl DelimFileWriter { Self { csv_writer, _data: PhantomData } } + /// Writes a single record to the underlying writer. pub fn write(&mut self, rec: &S) -> Result<()> { self.csv_writer.serialize(rec).map_err(FgError::ConversionError) } + /// Writes all records from `iter` to the underlying writer, in order. pub fn write_all(&mut self, iter: impl IntoIterator) -> Result<()> { for rec in iter { self.write(&rec)?; @@ -208,6 +219,9 @@ impl DelimFileWriter { Ok(()) } + /// Flushes the underlying writer. + /// Note: this is not strictly necessary as the underlying writer is flushed automatically + /// on `Drop`. pub fn flush(&mut self) -> Result<()> { self.csv_writer.flush().map_err(FgError::IoError) } From 0162ae92f0d65f21c20de6854036fbd7ebd33c0e Mon Sep 17 00:00:00 2001 From: jdidion Date: Tue, 7 Nov 2023 14:21:31 -0800 Subject: [PATCH 5/6] Add header field to reader --- src/io/mod.rs | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/io/mod.rs b/src/io/mod.rs index e8712de..76538b9 100644 --- a/src/io/mod.rs +++ b/src/io/mod.rs @@ -47,7 +47,7 @@ use std::marker::PhantomData; use std::path::Path; use crate::{FgError, Result}; -use csv::{DeserializeRecordsIntoIter, QuoteStyle, ReaderBuilder, Writer, WriterBuilder}; +use csv::{DeserializeRecordsIntoIter, QuoteStyle, ReaderBuilder, Writer, WriterBuilder, StringRecord}; use flate2::bufread::MultiGzDecoder; use flate2::write::GzEncoder; use flate2::Compression; @@ -157,19 +157,27 @@ impl Io { /// It also implements `Iterator`. pub struct DelimFileReader { record_iter: DeserializeRecordsIntoIter, D>, + header: StringRecord, } impl DelimFileReader { /// Returns a new `DelimFileReader` that will read records from the given reader with the given /// delimiter and quoting. Assumes the input file has a header row. - pub fn new(reader: Box, delimiter: u8, quote: bool) -> Self { - let csv_reader = ReaderBuilder::new() + pub fn new(reader: Box, delimiter: u8, quote: bool) -> Result { + let mut csv_reader = ReaderBuilder::new() .delimiter(delimiter) .has_headers(true) .quoting(quote) .from_reader(reader); + assert!(csv_reader.has_headers(), "Expected input file to have a header row"); + let header = csv_reader.headers().map_err(FgError::ConversionError)?.to_owned(); let record_iter = csv_reader.into_deserialize(); - Self { record_iter } + Ok(Self { record_iter, header }) + } + + /// Returns the contents of the header row. + pub fn header(&self) -> &StringRecord { + &self.header } /// Returns the next record from the underlying reader. @@ -251,7 +259,7 @@ impl DelimFile { quote: bool, ) -> Result> { let file = self.io.new_reader(path)?; - Ok(DelimFileReader::new(file, delimiter, quote)) + DelimFileReader::new(file, delimiter, quote) } /// Returns a new `DelimFileWriter` instance that writes to the given path, opened with this From c9b4a0a3598f808b5af95f99c4755ec68bfc1b25 Mon Sep 17 00:00:00 2001 From: jdidion Date: Tue, 7 Nov 2023 14:24:23 -0800 Subject: [PATCH 6/6] formatting --- src/io/mod.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/io/mod.rs b/src/io/mod.rs index 76538b9..58e7eb9 100644 --- a/src/io/mod.rs +++ b/src/io/mod.rs @@ -47,7 +47,9 @@ use std::marker::PhantomData; use std::path::Path; use crate::{FgError, Result}; -use csv::{DeserializeRecordsIntoIter, QuoteStyle, ReaderBuilder, Writer, WriterBuilder, StringRecord}; +use csv::{ + DeserializeRecordsIntoIter, QuoteStyle, ReaderBuilder, StringRecord, Writer, WriterBuilder, +}; use flate2::bufread::MultiGzDecoder; use flate2::write::GzEncoder; use flate2::Compression;