Skip to content

Commit

Permalink
Initial orc-metadata CLI tool
Browse files Browse the repository at this point in the history
  • Loading branch information
Jefffrey committed Mar 25, 2024
1 parent ff0f36e commit 8537adb
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 0 deletions.
6 changes: 6 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ rust-version = "1.70"
[dependencies]
arrow = { version = "50", features = ["prettyprint"] }
bytes = "1.4"
clap = { version = "4.5.3", features = ["derive"], optional = true }
fallible-streaming-iterator = { version = "0.1" }
flate2 = "1"
futures = { version = "0.3", optional = true, default-features = false, features = ["std"] }
Expand Down Expand Up @@ -54,6 +55,7 @@ serde_json = { version = "1.0", default-features = false, features = ["std"] }
default = ["async"]

async = ["futures", "futures-util", "tokio"]
cli = ["clap"]

[[bench]]
name = "arrow_reader"
Expand All @@ -63,3 +65,7 @@ required-features = ["async"]
[[example]]
name = "datafusion_integration"
required-features = ["async"]

[[bin]]
name = "orc-metadata"
required-features = ["cli"]
35 changes: 35 additions & 0 deletions src/bin/orc-metadata.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
use std::{error::Error, fs::File, path::PathBuf};

use clap::Parser;
use datafusion_orc::ArrowReaderBuilder;

// Command-line arguments for the `orc-metadata` binary, parsed via clap's derive API.
//
// NOTE(review): plain `//` comments are used here on purpose. With clap's derive, `///`
// doc comments on the struct and its fields are picked up as help text, so adding or
// rewording them would change the program's `--help` output.
#[derive(Parser)]
// `version` and `about` are listed without values; per clap's derive documentation they
// are then filled in from the Cargo package metadata. `long_about = None` leaves the long
// description unset.
#[command(version, about, long_about = None)]
struct Cli {
// Single argument with no `#[arg(...)]` attribute; `main` passes this path to `File::open`.
/// ORC file path
file: PathBuf,
}

/// Prints a short metadata summary for the ORC file named on the command line.
///
/// Four items are written to stdout, in order: the compression type (or the literal
/// `None` when the file metadata reports no compression), the number of rows, the number
/// of stripes, and the root schema.
///
/// # Errors
///
/// Returns an error if the file cannot be opened (the message names the offending path)
/// or if `ArrowReaderBuilder::try_new` fails on the opened file.
fn main() -> Result<(), Box<dyn Error>> {
    let cli = Cli::parse();

    // A bare `io::Error` does not say which file was involved, so attach the path here.
    let file = File::open(&cli.file)
        .map_err(|err| format!("failed to open {}: {err}", cli.file.display()))?;
    let builder = ArrowReaderBuilder::try_new(file)?;
    let metadata = builder.file_metadata();

    // TODO: better way to handle this printing?
    // TODO: move this display to actual library
    // `map_or_else` only builds the "None" fallback when there is no compression, instead
    // of allocating it unconditionally as `unwrap_or("None".to_string())` would.
    let compression = metadata
        .compression()
        .map_or_else(|| "None".to_string(), |c| c.compression_type().to_string());
    println!("compression: {compression}");
    println!("number of rows: {}", metadata.number_of_rows());
    println!("number of stripes: {}", metadata.stripe_metadatas().len());
    // TODO: nesting types indentation is messed up
    println!("schema:\n{}", metadata.root_data_type());

    Ok(())
}
10 changes: 10 additions & 0 deletions src/reader/decompress.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@ pub struct Compression {
}

impl Compression {
/// Returns the [`CompressionType`] held in this value's `compression_type` field.
///
/// The value is returned by value from `&self`, not as a reference.
pub fn compression_type(&self) -> CompressionType {
self.compression_type
}

pub fn from_proto(
kind: proto::CompressionKind,
compression_block_size: Option<u64>,
Expand Down Expand Up @@ -63,6 +67,12 @@ pub enum CompressionType {
Zstd,
}

/// Textual rendering of a [`CompressionType`].
///
/// The output is exactly the value's `Debug` representation, so `to_string()` and
/// `format!("{:?}", ..)` produce the same text.
impl std::fmt::Display for CompressionType {
    /// Writes the `Debug` form of `self` into `f`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Inline captured argument; formats `self` with `{:?}` exactly as before.
        write!(f, "{self:?}")
    }
}

/// Indicates length of block and whether it's compressed or not.
#[derive(Debug, PartialEq, Eq)]
enum CompressionHeader {
Expand Down

0 comments on commit 8537adb

Please sign in to comment.