[DOP-21813] Make partitioning_mode=hash tests less flaky
dolfinus committed Dec 3, 2024
1 parent 97126d1 commit 2296982
Showing 5 changed files with 0 additions and 57 deletions.
@@ -266,15 +266,6 @@ def test_clickhouse_reader_snapshot_with_partitioning_mode_hash(spark, processin
     # So just check that any partition has at least 0 rows
     assert table_df.groupBy(spark_partition_id()).count().count() == 3
 
-    # 100 rows per 3 partitions -> each partition should contain about ~33 rows,
-    # with some variance caused by randomness & hash distribution
-    min_count_per_partition = 10
-    max_count_per_partition = 55
-
-    count_per_partition = table_df.groupBy(spark_partition_id()).count().collect()
-    for partition in count_per_partition:
-        assert min_count_per_partition <= partition["count"] <= max_count_per_partition
-
 
 @pytest.mark.parametrize(
     "column",
@@ -362,15 +353,6 @@ def test_clickhouse_reader_snapshot_with_partitioning_mode_mod_date(spark, proce
     # So just check that any partition has at least 0 rows
     assert table_df.groupBy(spark_partition_id()).count().count() == 3
 
-    # 100 rows per 3 partitions -> each partition should contain about ~33 rows,
-    # with some variance caused by randomness & hash distribution
-    min_count_per_partition = 10
-    max_count_per_partition = 55
-
-    count_per_partition = table_df.groupBy(spark_partition_id()).count().collect()
-    for partition in count_per_partition:
-        assert min_count_per_partition <= partition["count"] <= max_count_per_partition
-
 
 @pytest.mark.parametrize(
     "column",
@@ -272,16 +272,6 @@ def test_mssql_reader_snapshot_with_partitioning_mode_hash(spark, processing, lo
     # So just check that any partition has at least 0 rows
     assert table_df.groupBy(spark_partition_id()).count().count() == 3
 
-    # 100 rows per 3 partitions -> each partition should contain about ~33 rows,
-    # with some variance caused by randomness & hash distribution
-    min_count_per_partition = 10
-    max_count_per_partition = 55
-
-    count_per_partition = table_df.groupBy(spark_partition_id()).count().collect()
-
-    for partition in count_per_partition:
-        assert min_count_per_partition <= partition["count"] <= max_count_per_partition
-
 
 def test_mssql_reader_snapshot_with_partitioning_mode_mod(spark, processing, load_table_data):
     from pyspark.sql.functions import spark_partition_id
@@ -263,15 +263,6 @@ def test_mysql_reader_snapshot_with_partitioning_mode_hash(spark, processing, lo
     # So just check that any partition has at least 0 rows
     assert table_df.groupBy(spark_partition_id()).count().count() == 3
 
-    # 100 rows per 3 partitions -> each partition should contain about ~33 rows,
-    # with some variance caused by randomness & hash distribution (+- 50% range is wide enough)
-    min_count_per_partition = 10
-    max_count_per_partition = 55
-
-    count_per_partition = table_df.groupBy(spark_partition_id()).count().collect()
-    for partition in count_per_partition:
-        assert min_count_per_partition <= partition["count"] <= max_count_per_partition
-
 
 @pytest.mark.parametrize(
     "column",
@@ -221,16 +221,6 @@ def test_oracle_reader_snapshot_with_partitioning_mode_hash(spark, processing, l
     # So just check that any partition has at least 0 rows
     assert table_df.groupBy(spark_partition_id()).count().count() == 3
 
-    # 100 rows per 3 partitions -> each partition should contain about ~33 rows,
-    # with some variance caused by randomness & hash distribution
-    min_count_per_partition = 10
-    max_count_per_partition = 55
-
-    count_per_partition = table_df.groupBy(spark_partition_id()).count().collect()
-
-    for partition in count_per_partition:
-        assert min_count_per_partition <= partition["count"] <= max_count_per_partition
-
 
 # Apparently, Oracle supports modulus for text columns type
 @pytest.mark.parametrize(
@@ -508,16 +508,6 @@ def test_postgres_reader_snapshot_with_partitioning_mode_hash(spark, processing,
     # So just check that any partition has at least 0 rows
     assert table_df.groupBy(spark_partition_id()).count().count() == 3
 
-    # 100 rows per 3 partitions -> each partition should contain about ~33 rows,
-    # with some variance caused by randomness & hash distribution
-    min_count_per_partition = 10
-    max_count_per_partition = 55
-
-    count_per_partition = table_df.groupBy(spark_partition_id()).count().collect()
-
-    for partition in count_per_partition:
-        assert min_count_per_partition <= partition["count"] <= max_count_per_partition
-
 
 def test_postgres_reader_snapshot_with_partitioning_mode_mod(spark, processing, load_table_data):
     from pyspark.sql.functions import spark_partition_id
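For context, a minimal standalone sketch of the behaviour behind the removed assertions, assuming rows are bucketed roughly as hash(column) % 3 (the helper names and key generation below are illustrative, not from the repository): 100 rows over 3 hash partitions average ~33 rows each, but the actual split depends on the key values and the database-side hash expression, so a hard [10, 55] band can occasionally be violated.

# Illustrative sketch (assumptions noted above): measure how unevenly
# 100 rows can land across 3 hash-based partitions.
import random
from collections import Counter

def partition_counts(keys, num_partitions=3):
    # Emulates hash(column) % num_partitions bucketing.
    counts = Counter(hash(key) % num_partitions for key in keys)
    return [counts.get(i, 0) for i in range(num_partitions)]

worst_min, worst_max = 100, 0
for _ in range(1_000):
    keys = [random.getrandbits(64) for _ in range(100)]  # 100 rows, random keys
    counts = partition_counts(keys)
    worst_min = min(worst_min, min(counts))
    worst_max = max(worst_max, max(counts))

# Prints the extremes seen across 1000 simulated runs. A [10, 55] band holds
# only statistically; real test data and database-side hash functions can
# distribute less evenly than this simulation, hence the flaky assertions.
print(f"per-partition counts ranged from {worst_min} to {worst_max}")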
