[DOP-21813] Make partitioning_mode=hash tests less flaky
dolfinus committed Dec 3, 2024
1 parent 97126d1 commit 2296982
Showing 5 changed files with 0 additions and 57 deletions.
@@ -266,15 +266,6 @@ def test_clickhouse_reader_snapshot_with_partitioning_mode_hash(spark, processin
     # So just check that any partition has at least 0 rows
     assert table_df.groupBy(spark_partition_id()).count().count() == 3
 
-    # 100 rows per 3 partitions -> each partition should contain about ~33 rows,
-    # with some variance caused by randomness & hash distribution
-    min_count_per_partition = 10
-    max_count_per_partition = 55
-
-    count_per_partition = table_df.groupBy(spark_partition_id()).count().collect()
-    for partition in count_per_partition:
-        assert min_count_per_partition <= partition["count"] <= max_count_per_partition
-
 
 @pytest.mark.parametrize(
     "column",
@@ -362,15 +353,6 @@ def test_clickhouse_reader_snapshot_with_partitioning_mode_mod_date(spark, proce
     # So just check that any partition has at least 0 rows
     assert table_df.groupBy(spark_partition_id()).count().count() == 3
 
-    # 100 rows per 3 partitions -> each partition should contain about ~33 rows,
-    # with some variance caused by randomness & hash distribution
-    min_count_per_partition = 10
-    max_count_per_partition = 55
-
-    count_per_partition = table_df.groupBy(spark_partition_id()).count().collect()
-    for partition in count_per_partition:
-        assert min_count_per_partition <= partition["count"] <= max_count_per_partition
-
 
 @pytest.mark.parametrize(
     "column",
@@ -272,16 +272,6 @@ def test_mssql_reader_snapshot_with_partitioning_mode_hash(spark, processing, lo
     # So just check that any partition has at least 0 rows
     assert table_df.groupBy(spark_partition_id()).count().count() == 3
 
-    # 100 rows per 3 partitions -> each partition should contain about ~33 rows,
-    # with some variance caused by randomness & hash distribution
-    min_count_per_partition = 10
-    max_count_per_partition = 55
-
-    count_per_partition = table_df.groupBy(spark_partition_id()).count().collect()
-
-    for partition in count_per_partition:
-        assert min_count_per_partition <= partition["count"] <= max_count_per_partition
-
 
 def test_mssql_reader_snapshot_with_partitioning_mode_mod(spark, processing, load_table_data):
     from pyspark.sql.functions import spark_partition_id
@@ -263,15 +263,6 @@ def test_mysql_reader_snapshot_with_partitioning_mode_hash(spark, processing, lo
     # So just check that any partition has at least 0 rows
     assert table_df.groupBy(spark_partition_id()).count().count() == 3
 
-    # 100 rows per 3 partitions -> each partition should contain about ~33 rows,
-    # with some variance caused by randomness & hash distribution (+- 50% range is wide enough)
-    min_count_per_partition = 10
-    max_count_per_partition = 55
-
-    count_per_partition = table_df.groupBy(spark_partition_id()).count().collect()
-    for partition in count_per_partition:
-        assert min_count_per_partition <= partition["count"] <= max_count_per_partition
-
 
 @pytest.mark.parametrize(
     "column",
@@ -221,16 +221,6 @@ def test_oracle_reader_snapshot_with_partitioning_mode_hash(spark, processing, l
     # So just check that any partition has at least 0 rows
     assert table_df.groupBy(spark_partition_id()).count().count() == 3
 
-    # 100 rows per 3 partitions -> each partition should contain about ~33 rows,
-    # with some variance caused by randomness & hash distribution
-    min_count_per_partition = 10
-    max_count_per_partition = 55
-
-    count_per_partition = table_df.groupBy(spark_partition_id()).count().collect()
-
-    for partition in count_per_partition:
-        assert min_count_per_partition <= partition["count"] <= max_count_per_partition
-
 
 # Apparently, Oracle supports modulus for text columns type
 @pytest.mark.parametrize(
@@ -508,16 +508,6 @@ def test_postgres_reader_snapshot_with_partitioning_mode_hash(spark, processing,
     # So just check that any partition has at least 0 rows
     assert table_df.groupBy(spark_partition_id()).count().count() == 3
 
-    # 100 rows per 3 partitions -> each partition should contain about ~33 rows,
-    # with some variance caused by randomness & hash distribution
-    min_count_per_partition = 10
-    max_count_per_partition = 55
-
-    count_per_partition = table_df.groupBy(spark_partition_id()).count().collect()
-
-    for partition in count_per_partition:
-        assert min_count_per_partition <= partition["count"] <= max_count_per_partition
-
 
 def test_postgres_reader_snapshot_with_partitioning_mode_mod(spark, processing, load_table_data):
     from pyspark.sql.functions import spark_partition_id
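For context, a minimal standalone sketch of the behaviour behind the removed assertions, assuming rows are bucketed roughly as hash(column) % 3 (the helper names and key generation below are illustrative, not from the repository): 100 rows over 3 hash partitions average ~33 rows each, but the actual split depends on the key values and the database-side hash expression, so a hard [10, 55] band can occasionally be violated.

# Illustrative sketch (assumptions noted above): measure how unevenly
# 100 rows can land across 3 hash-based partitions.
import random
from collections import Counter

def partition_counts(keys, num_partitions=3):
    # Emulates hash(column) % num_partitions bucketing.
    counts = Counter(hash(key) % num_partitions for key in keys)
    return [counts.get(i, 0) for i in range(num_partitions)]

worst_min, worst_max = 100, 0
for _ in range(1_000):
    keys = [random.getrandbits(64) for _ in range(100)]  # 100 rows, random keys
    counts = partition_counts(keys)
    worst_min = min(worst_min, min(counts))
    worst_max = max(worst_max, max(counts))

# Prints the extremes seen across 1000 simulated runs. A [10, 55] band holds
# only statistically; real test data and database-side hash functions can
# distribute less evenly than this simulation, hence the flaky assertions.
print(f"per-partition counts ranged from {worst_min} to {worst_max}")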
