From 24bd57ecf49f710c40d6915321e5245021ed41fb Mon Sep 17 00:00:00 2001
From: Jefffrey
Date: Wed, 27 Mar 2024 07:39:00 +1100
Subject: [PATCH] Enhance orc-metadata bin to show basic stripe metadata

---
 src/bin/orc-metadata.rs | 37 ++++++++++++++++++++++++++++++++-----
 src/stripe.rs           |  5 +++++
 2 files changed, 37 insertions(+), 5 deletions(-)

diff --git a/src/bin/orc-metadata.rs b/src/bin/orc-metadata.rs
index dcb59df8..58903c31 100644
--- a/src/bin/orc-metadata.rs
+++ b/src/bin/orc-metadata.rs
@@ -1,21 +1,24 @@
-use std::{error::Error, fs::File, path::PathBuf};
+use std::{error::Error, fs::File, path::PathBuf, sync::Arc};
 
 use clap::Parser;
-use datafusion_orc::ArrowReaderBuilder;
+use datafusion_orc::{reader::metadata::read_metadata, stripe::Stripe};
 
 #[derive(Parser)]
 #[command(version, about, long_about = None)]
 struct Cli {
     /// ORC file path
     file: PathBuf,
+
+    /// Display data for all stripes
+    #[arg(short, long)]
+    stripes: bool,
 }
 
 fn main() -> Result<(), Box<dyn Error>> {
     let cli = Cli::parse();
 
-    let f = File::open(cli.file)?;
-    let builder = ArrowReaderBuilder::try_new(f)?;
-    let metadata = builder.file_metadata();
+    let mut f = File::open(cli.file)?;
+    let metadata = Arc::new(read_metadata(&mut f)?);
 
     // TODO: better way to handle this printing?
     // TODO: move this display to actual library
@@ -31,5 +34,29 @@ fn main() -> Result<(), Box<dyn Error>> {
     // TODO: nesting types indentation is messed up
     println!("schema:\n{}", metadata.root_data_type());
 
+    if cli.stripes {
+        println!("\n=== Stripes ===");
+        for (i, stripe_metadata) in metadata.stripe_metadatas().iter().enumerate() {
+            let stripe = Stripe::new(
+                &mut f,
+                &metadata,
+                metadata.root_data_type(),
+                i,
+                stripe_metadata,
+            )?;
+            println!("stripe index: {i}");
+            println!("number of rows: {}", stripe.number_of_rows());
+            println!(
+                "writer timezone: {}",
+                stripe
+                    .footer()
+                    .writer_timezone
+                    .as_deref()
+                    .unwrap_or("None")
+            );
+            println!();
+        }
+    }
+
     Ok(())
 }
diff --git a/src/stripe.rs b/src/stripe.rs
index 8546445e..2aa75cf3 100644
--- a/src/stripe.rs
+++ b/src/stripe.rs
@@ -153,6 +153,11 @@ impl Stripe {
     pub fn stripe_offset(&self) -> usize {
         self.stripe_offset
     }
+
+    /// Returns the number of rows in this stripe.
+    pub fn number_of_rows(&self) -> usize {
+        self.number_of_rows
+    }
 }
 
 #[derive(Debug)]