Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Schema POC #155

Merged
merged 4 commits into from
Feb 2, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/cmd/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ pub mod rename;
pub mod replace;
pub mod reverse;
pub mod sample;
pub mod schema;
pub mod search;
pub mod searchset;
pub mod select;
Expand Down
242 changes: 242 additions & 0 deletions src/cmd/schema.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,242 @@
use crate::config::{Config, Delimiter};
use crate::util;
use crate::CliError;
use crate::CliResult;
use crate::cmd::stats::{FieldType};
use csv::ByteRecord;
use indicatif::{ProgressBar, ProgressDrawTarget};
use log::{debug, error};
use serde::Deserialize;
use serde_json::{Map, Value, json};
use stats::Frequencies;
use std::{fs::File, io::Write, ops::Add};

// Early-return helper: wraps a String message in `CliError::Other` and
// returns it from the enclosing function (which must return `CliResult`).
macro_rules! fail {
($mesg:expr) => {
return Err(CliError::Other($mesg));
};
}

// Help/usage text parsed by docopt (via util::get_args) to build `Args`.
// Fix: corrected "schmea" -> "schema" typo in the user-facing description.
static USAGE: &str = "
Infer schema from CSV data and output in JSON Schema format.

Example output file from `mydata.csv`. If piped from stdin, then filename is `stdin.csv`.

* mydata.csv.schema.json

Usage:
qsv schema [options] [<input>]

schema options:


Common options:
    -h, --help                 Display this message
    -n, --no-headers           When set, the first row will not be interpreted
                               as headers. Namely, it will be sorted with the rest
                               of the rows. Otherwise, the first row will always
                               appear as the header row in the output.
    -d, --delimiter <arg>      The field delimiter for reading CSV data.
                               Must be a single character. [default: ,]
    -q, --quiet                Don't show progress bars.
";

// Command-line arguments for `qsv schema`, deserialized from USAGE by docopt
// via util::get_args. Field names must match the option names in USAGE.
#[derive(Deserialize, Debug)]
struct Args {
// --no-headers: treat the first row as data, not column names
flag_no_headers: bool,
// --delimiter: single-character field delimiter (defaults to ',' per USAGE)
flag_delimiter: Option<Delimiter>,
// --quiet: suppress the progress bar
flag_quiet: bool,
// positional <input> CSV path; None means read from stdin
arg_input: Option<String>,
}

pub fn run(argv: &[&str]) -> CliResult<()> {
let args: Args = util::get_args(USAGE, argv)?;

// dbg!(&args);

let rconfig = Config::new(&args.arg_input)
.delimiter(args.flag_delimiter)
.no_headers(args.flag_no_headers);

let mut rdr = rconfig.reader()?;

let headers = rdr.byte_headers()?.clone();

let input_path: &str = &args.arg_input.unwrap_or_else(|| "stdin.csv".to_string());

let mut schema_output_file = File::create(input_path.to_owned() + ".schema.json")
.expect("unable to create schema output file");

// prep progress bar
let progress = ProgressBar::new(0);
if !args.flag_quiet {
let record_count = util::count_rows(&rconfig.flexible(true));
util::prep_progress(&progress, record_count);
} else {
progress.set_draw_target(ProgressDrawTarget::hidden());
}

// amortize memory allocation by reusing record
#[allow(unused_assignments)]
let mut record = ByteRecord::new();

let mut row_index: u32 = 0;

// array of frequency tables to track non-NULL type occurrences
let mut frequency_tables: Vec<_> = (0..(headers.len() as u32)).map(|_| Frequencies::<FieldType>::new()).collect();
// array of boolean to track if column is NULLABLE
let mut nullable_flags: Vec<bool> = vec![false; headers.len()];

// iterate over each CSV field and determine type
let headers_iter = headers.iter().enumerate();

while rdr.read_byte_record(&mut record)? {
row_index = row_index.add(1);

// dbg!(&record);

for col_index in 0..headers.len() {

// since from_sample() parses byte slice to string, no need to do it here
let value_slice: &[u8] = &record[col_index];

let inferred_type: FieldType = FieldType::from_sample(value_slice);

debug!("column_{col_index}[{row_index}]: val={value_slice:?}, type={inferred_type}");

// update frequency table for this column
match inferred_type {
FieldType::TNull => {

// only count NULL once, so it won't dominate frequency table when value is optional
if nullable_flags[col_index] == false {
frequency_tables[col_index].add(FieldType::TNull);
}
nullable_flags[col_index] = true;

}
FieldType::TUnknown => {
// default to String
frequency_tables[col_index].add(FieldType::TUnicode);
}
x => {
frequency_tables[col_index].add(x);
}
}

}

if !args.flag_quiet {
progress.inc(1);
}
} // end main while loop over csv records

use thousands::Separable;

if !args.flag_quiet {
progress.set_message(format!(
" processed {} records.",
progress.length().separate_with_commas()
));
util::finish_progress(&progress);
}

debug!("freq tables: {frequency_tables:?}");

// map holds "properties" object of json schema
let mut properties_map: Map<String, Value> = Map::new();

// iterate through headers again and get most frequent type for each column
for (i, header) in headers_iter {
let most_frequent = frequency_tables[i].most_frequent();
let (inferred_type, count) = match most_frequent.get(0) {
Some(tuple) => *tuple,
None => {
// not good, no type info for this column
let msg = format!("Cannot determine type for column '{header:?}'");
error!("{msg}");
fail!(msg);
}
};

// convert csv header to string
let header_string: String = match std::str::from_utf8(header){
Ok(s) => {
s.to_string()
},
Err(e) => {
let msg = format!("Can't read header from column {i} as utf8: {e}");
error!("{msg}");
fail!(msg);
}
};

let required: bool = if *inferred_type != FieldType::TNull &&
*inferred_type != FieldType::TUnknown &&
count as u32 >= row_index {
true
} else {
false
};

debug!("{header_string} has most frequent type of {inferred_type:?}, required={required}");

// use list since optional columns get appended a "null" type
let mut type_list: Vec<Value> = Vec::new();

match inferred_type {
FieldType::TUnicode => {
type_list.push(Value::String("string".to_string()));
},
FieldType::TDate => {
type_list.push(Value::String("string".to_string()));
},
FieldType::TInteger => {
type_list.push(Value::String("integer".to_string()));
},
FieldType::TFloat => {
type_list.push(Value::String("number".to_string()));
},
FieldType::TNull => {
type_list.push(Value::String("null".to_string()));
},
_ => {
// defaults to JSON String
type_list.push(Value::String("string".to_string()));
},
}

// "null" type denotes optinal value
// to be compatible with "validate" command, has to come after the real type, and only if type is not already JSON Null
if !required && *inferred_type != FieldType::TNull {
type_list.push(Value::String("null".to_string()));
}

let mut field_map: Map<String, Value> = Map::new();
field_map.insert("type".to_string(), Value::Array(type_list));
properties_map.insert(header_string, Value::Object(field_map));

} // end for loop over all columns

// create final JSON object for output
let schema = json!({
"$schema": "https://json-schema.org/draft-07/schema",
"title": format!("JSON Schema for {input_path}"),
"description": "Inferred JSON Schema from QSV schema command",
"type": "object",
"properties": Value::Object(properties_map)
});

let schema_pretty = serde_json::to_string_pretty(&schema).expect("prettify schema json");

schema_output_file
.write_all(schema_pretty.as_bytes())
.expect("unable to write schema file");

// flush error report; file gets closed automagically when out-of-scope
schema_output_file.flush().unwrap();

Ok(())
}


19 changes: 16 additions & 3 deletions src/cmd/stats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -530,8 +530,8 @@ impl Commute for Stats {
}
}

#[derive(Clone, Copy, PartialEq)]
enum FieldType {
#[derive(Clone, Copy, PartialEq, Eq, Hash)]
pub enum FieldType {
TUnknown,
TNull,
TUnicode,
Expand All @@ -541,7 +541,7 @@ enum FieldType {
}

impl FieldType {
fn from_sample(sample: &[u8]) -> FieldType {
pub fn from_sample(sample: &[u8]) -> FieldType {
if sample.is_empty() {
return TNull;
}
Expand Down Expand Up @@ -616,6 +616,19 @@ impl fmt::Display for FieldType {
}
}

impl fmt::Debug for FieldType {
    /// Human-readable variant labels used in debug/log output
    /// (e.g. the frequency-table dumps in the schema command).
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let label = match *self {
            Self::TUnknown => "Unknown",
            Self::TNull => "NULL",
            Self::TUnicode => "Unicode",
            Self::TFloat => "Float",
            Self::TInteger => "Integer",
            Self::TDate => "Date",
        };
        f.write_str(label)
    }
}

/// TypedSum keeps a rolling sum of the data seen.
///
/// It sums integers until it sees a float, at which point it sums floats.
Expand Down
5 changes: 4 additions & 1 deletion src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,10 +67,11 @@ macro_rules! command_list {
partition Partition CSV data based on a column value
pseudo Pseudonymise the values of a column
py* Evaluate a Python expression on CSV data
sample Randomly sample CSV data
rename Rename the columns of CSV data efficiently
replace Replace patterns in CSV data
reverse Reverse rows of CSV data
sample Randomly sample CSV data
schema Infer schema from CSV data
search Search CSV data with a regex
searchset Search CSV data with a regex set
select Select, re-order, duplicate or drop columns
Expand Down Expand Up @@ -251,6 +252,7 @@ enum Command {
Replace,
Reverse,
Sample,
Schema,
Search,
SearchSet,
Select,
Expand Down Expand Up @@ -313,6 +315,7 @@ impl Command {
Command::Replace => cmd::replace::run(argv),
Command::Reverse => cmd::reverse::run(argv),
Command::Sample => cmd::sample::run(argv),
Command::Schema => cmd::schema::run(argv),
Command::Search => cmd::search::run(argv),
Command::SearchSet => cmd::searchset::run(argv),
Command::Select => cmd::select::run(argv),
Expand Down
5 changes: 4 additions & 1 deletion src/mainlite.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,10 +60,11 @@ macro_rules! command_list {
jsonl Convert newline-delimited JSON files to CSV
partition Partition CSV data based on a column value
pseudo Pseudonymise the values of a column
sample Randomly sample CSV data
rename Rename the columns of CSV data efficiently
replace Replace patterns in CSV data
reverse Reverse rows of CSV data
sample Randomly sample CSV data
schema Infer schema from CSV data
search Search CSV data with a regex
searchset Search CSV data with a regex set
select Select, re-order, duplicate or drop columns
Expand Down Expand Up @@ -217,6 +218,7 @@ enum Command {
Replace,
Reverse,
Sample,
Schema,
Search,
SearchSet,
Select,
Expand Down Expand Up @@ -271,6 +273,7 @@ impl Command {
Command::Replace => cmd::replace::run(argv),
Command::Reverse => cmd::reverse::run(argv),
Command::Sample => cmd::sample::run(argv),
Command::Schema => cmd::schema::run(argv),
Command::Search => cmd::search::run(argv),
Command::SearchSet => cmd::searchset::run(argv),
Command::Select => cmd::select::run(argv),
Expand Down