-
Notifications
You must be signed in to change notification settings - Fork 175
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
fix(join): joining on different types (#3716)
This PR fixes a few things and also has some QOL changes.

Fixes:
- Joining on null-type join keys
- Joining on an empty table (resolves #3071), which turned out to be the above issue
- Joining on join keys with different types
- Combined column typing (right and outer joins should not just give left column types)

QOL:
- Combine all the column renaming parameters into a `JoinOptions` type. This reduces the parameters for a bunch of functions and also uses the builder pattern
- Rename the `keep_join_keys` field to `merge_matching_join_keys` to make the behavior clearer
- Loading branch information
1 parent
de5acf5
commit d00e444
Showing
31 changed files
with
652 additions
and
638 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
use common_error::DaftResult; | ||
use daft_core::{prelude::*, utils::supertype::try_get_supertype}; | ||
use indexmap::IndexSet; | ||
|
||
use crate::{deduplicate_expr_names, ExprRef}; | ||
|
||
pub fn get_common_join_cols<'a>( | ||
left_schema: &'a SchemaRef, | ||
right_schema: &'a SchemaRef, | ||
) -> impl Iterator<Item = &'a String> { | ||
left_schema | ||
.fields | ||
.keys() | ||
.filter(|name| right_schema.has_field(name)) | ||
} | ||
|
||
/// Infer the schema of a join operation | ||
pub fn infer_join_schema( | ||
left_schema: &SchemaRef, | ||
right_schema: &SchemaRef, | ||
join_type: JoinType, | ||
) -> DaftResult<SchemaRef> { | ||
if matches!(join_type, JoinType::Anti | JoinType::Semi) { | ||
Ok(left_schema.clone()) | ||
} else { | ||
let common_cols = get_common_join_cols(left_schema, right_schema).collect::<IndexSet<_>>(); | ||
|
||
// common columns, then unique left fields, then unique right fields | ||
let fields = common_cols | ||
.iter() | ||
.map(|name| { | ||
let left_field = left_schema.get_field(name).unwrap(); | ||
let right_field = right_schema.get_field(name).unwrap(); | ||
|
||
Ok(match join_type { | ||
JoinType::Inner => left_field.clone(), | ||
JoinType::Left => left_field.clone(), | ||
JoinType::Right => right_field.clone(), | ||
JoinType::Outer => { | ||
let supertype = try_get_supertype(&left_field.dtype, &right_field.dtype)?; | ||
|
||
Field::new(*name, supertype) | ||
} | ||
JoinType::Anti | JoinType::Semi => unreachable!(), | ||
}) | ||
}) | ||
.chain( | ||
left_schema | ||
.fields | ||
.iter() | ||
.chain(right_schema.fields.iter()) | ||
.filter_map(|(name, field)| { | ||
if common_cols.contains(name) { | ||
None | ||
} else { | ||
Some(field.clone()) | ||
} | ||
}) | ||
.map(Ok), | ||
) | ||
.collect::<DaftResult<_>>()?; | ||
|
||
Ok(Schema::new(fields)?.into()) | ||
} | ||
} | ||
|
||
/// Casts join keys to the same types and make their names unique. | ||
pub fn normalize_join_keys( | ||
left_on: Vec<ExprRef>, | ||
right_on: Vec<ExprRef>, | ||
left_schema: SchemaRef, | ||
right_schema: SchemaRef, | ||
) -> DaftResult<(Vec<ExprRef>, Vec<ExprRef>)> { | ||
let (left_on, right_on) = left_on | ||
.into_iter() | ||
.zip(right_on) | ||
.map(|(mut l, mut r)| { | ||
let l_dtype = l.to_field(&left_schema)?.dtype; | ||
let r_dtype = r.to_field(&right_schema)?.dtype; | ||
|
||
let supertype = try_get_supertype(&l_dtype, &r_dtype)?; | ||
|
||
if l_dtype != supertype { | ||
l = l.cast(&supertype); | ||
} | ||
|
||
if r_dtype != supertype { | ||
r = r.cast(&supertype); | ||
} | ||
|
||
Ok((l, r)) | ||
}) | ||
.collect::<DaftResult<(Vec<_>, Vec<_>)>>()?; | ||
|
||
let left_on = deduplicate_expr_names(&left_on); | ||
let right_on = deduplicate_expr_names(&right_on); | ||
|
||
Ok((left_on, right_on)) | ||
} |
This file was deleted.
Oops, something went wrong.
Oops, something went wrong.