Skip to content

Commit

Permalink
feat: update vcf table options
Browse files Browse the repository at this point in the history
  • Loading branch information
tshauck committed Apr 24, 2024
1 parent 0cb3dd6 commit c17d419
Show file tree
Hide file tree
Showing 3 changed files with 47 additions and 18 deletions.
7 changes: 6 additions & 1 deletion exon/exon-core/src/datasources/exon_listing_table_factory.rs
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,10 @@ impl ExonListingTableFactory {
}
ExonFileType::VCF => {
let vcf_options = ListingVCFTableOptions::new(file_compression_type, false)
.with_table_partition_cols(table_partition_cols);
.with_table_partition_cols(table_partition_cols)
.with_parse_info(exon_config_extension.vcf_parse_info)
.with_parse_formats(exon_config_extension.vcf_parse_formats);

let table_schema = vcf_options.infer_schema(state, &table_path).await?;

let config = ExonListingConfig::new_with_options(table_path, vcf_options);
Expand All @@ -213,6 +216,8 @@ impl ExonListingTableFactory {
}
ExonFileType::IndexedVCF => {
let vcf_options = ListingVCFTableOptions::new(file_compression_type, true)
.with_parse_info(exon_config_extension.vcf_parse_info)
.with_parse_formats(exon_config_extension.vcf_parse_formats)
.with_table_partition_cols(table_partition_cols);

let table_schema = vcf_options.infer_schema(state, &table_path).await?;
Expand Down
39 changes: 26 additions & 13 deletions exon/exon-core/src/datasources/vcf/table_provider.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@ use object_store::{ObjectMeta, ObjectStore};
use tokio_util::io::StreamReader;

use crate::{
config::ExonConfigExtension,
datasources::{
exon_listing_table_options::{
ExonIndexedListingOptions, ExonListingConfig, ExonListingOptions,
Expand Down Expand Up @@ -71,6 +70,12 @@ pub struct ListingVCFTableOptions {

/// A list of table partition columns
table_partition_cols: Vec<Field>,

/// Whether to parse the INFO field
parse_info: bool,

/// Whether to parse the FORMAT field
parse_formats: bool,
}

impl Default for ListingVCFTableOptions {
Expand All @@ -81,6 +86,8 @@ impl Default for ListingVCFTableOptions {
regions: Vec::new(),
file_compression_type: FileCompressionType::UNCOMPRESSED,
table_partition_cols: Vec::new(),
parse_info: false,
parse_formats: false,
}
}
}
Expand Down Expand Up @@ -141,6 +148,8 @@ impl ListingVCFTableOptions {
indexed,
table_partition_cols: Vec::new(),
regions: Vec::new(),
parse_info: false,
parse_formats: false,
}
}

Expand Down Expand Up @@ -169,9 +178,22 @@ impl ListingVCFTableOptions {
}
}

/// Builder-style setter for the `parse_info` flag, which controls whether the
/// VCF INFO field is parsed.
///
/// Consumes the options and returns them with only `parse_info` replaced;
/// every other setting is carried over unchanged.
pub fn with_parse_info(self, parse_info: bool) -> Self {
    let mut options = self;
    options.parse_info = parse_info;
    options
}

/// Builder-style setter for the `parse_formats` flag, which controls whether
/// the VCF FORMAT field is parsed.
///
/// Consumes the options and returns them with only `parse_formats` replaced;
/// every other setting is carried over unchanged.
pub fn with_parse_formats(self, parse_formats: bool) -> Self {
    let mut options = self;
    options.parse_formats = parse_formats;
    options
}

async fn infer_schema_from_object_meta(
&self,
state: &SessionState,
_state: &SessionState,
store: &Arc<dyn ObjectStore>,
objects: &[ObjectMeta],
) -> datafusion::error::Result<TableSchema> {
Expand All @@ -186,18 +208,9 @@ impl ListingVCFTableOptions {
let stream_reader = Box::pin(get_result.into_stream().map_err(DataFusionError::from));
let stream_reader = StreamReader::new(stream_reader);

let exon_settings = state
.config()
.options()
.extensions
.get::<ExonConfigExtension>()
.ok_or(DataFusionError::Execution(
"Exon settings must be configured.".to_string(),
))?;

let mut builder = VCFSchemaBuilder::default()
.with_parse_info(exon_settings.vcf_parse_info)
.with_parse_formats(exon_settings.vcf_parse_formats)
.with_parse_info(self.parse_info)
.with_parse_formats(self.parse_formats)
.with_partition_fields(self.table_partition_cols.clone());

let header = match self.file_compression_type {
Expand Down
19 changes: 15 additions & 4 deletions exon/exon-core/src/datasources/vcf/udtf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
use std::sync::Arc;

use crate::{
config::extract_config_from_state,
datasources::{exon_listing_table_options::ExonListingConfig, ScanFunction},
error::ExonError,
};
Expand Down Expand Up @@ -48,12 +49,17 @@ impl TableFunctionImpl for VCFScanFunction {
fn call(&self, exprs: &[Expr]) -> Result<Arc<dyn TableProvider>> {
let listing_scan_function = ScanFunction::try_from(exprs)?;

let state = self.ctx.state();
let exon_config_extension = extract_config_from_state(&state)?;

let listing_table_options =
ListingVCFTableOptions::new(listing_scan_function.file_compression_type, false);
ListingVCFTableOptions::new(listing_scan_function.file_compression_type, false)
.with_parse_formats(exon_config_extension.vcf_parse_formats)
.with_parse_info(exon_config_extension.vcf_parse_info);

let schema = futures::executor::block_on(async {
let schema = listing_table_options
.infer_schema(&self.ctx.state(), &listing_scan_function.listing_table_url)
.infer_schema(&state, &listing_scan_function.listing_table_url)
.await?;

Ok::<TableSchema, datafusion::error::DataFusionError>(schema)
Expand Down Expand Up @@ -100,8 +106,13 @@ impl TableFunctionImpl for VCFIndexedScanFunction {

let region = region_str.parse().map_err(ExonError::from)?;

let listing_table_options =
ListingVCFTableOptions::new(FileCompressionType::GZIP, true).with_regions(vec![region]);
let state = self.ctx.state();
let exon_config_extension = extract_config_from_state(&state)?;

let listing_table_options = ListingVCFTableOptions::new(FileCompressionType::GZIP, true)
.with_regions(vec![region])
.with_parse_info(exon_config_extension.vcf_parse_info)
.with_parse_formats(exon_config_extension.vcf_parse_formats);

let schema = futures::executor::block_on(async {
let schema = listing_table_options
Expand Down

0 comments on commit c17d419

Please sign in to comment.