diff --git a/be/src/runtime/statistic_result_writer.cpp b/be/src/runtime/statistic_result_writer.cpp
index 08a0d4638c012..d099f9a246b5b 100644
--- a/be/src/runtime/statistic_result_writer.cpp
+++ b/be/src/runtime/statistic_result_writer.cpp
@@ -29,6 +29,7 @@ const int STATISTIC_HISTOGRAM_VERSION = 2;
 const int DICT_STATISTIC_DATA_VERSION = 101;
 const int STATISTIC_TABLE_VERSION = 3;
 const int STATISTIC_BATCH_VERSION = 4;
+const int STATISTIC_PARTITION_VERSION = 11;
 const int STATISTIC_EXTERNAL_VERSION = 5;
 const int STATISTIC_EXTERNAL_QUERY_VERSION = 6;
 const int STATISTIC_EXTERNAL_HISTOGRAM_VERSION = 7;
@@ -148,6 +149,9 @@ StatusOr StatisticResultWriter::_process_chunk(Chunk* chunk
     } else if (version == STATISTIC_TABLE_VERSION) {
         RETURN_IF_ERROR_WITH_WARN(_fill_table_statistic_data(version, result_columns, chunk, result.get()),
                                   "Fill table statistic data failed");
+    } else if (version == STATISTIC_PARTITION_VERSION) {
+        RETURN_IF_ERROR_WITH_WARN(_fill_partition_statistic_data(version, result_columns, chunk, result.get()),
+                                  "Fill partition statistic data failed");
     } else if (version == STATISTIC_BATCH_VERSION) {
         RETURN_IF_ERROR_WITH_WARN(_fill_full_statistic_data_v4(version, result_columns, chunk, result.get()),
                                   "Fill table statistic data failed");
@@ -380,6 +384,45 @@ Status StatisticResultWriter::_fill_full_statistic_data_v4(int version, const Co
     return Status::OK();
 }
 
+Status StatisticResultWriter::_fill_partition_statistic_data(int version, const Columns& columns, const Chunk* chunk,
+                                                             TFetchDataResult* result) {
+    /*
+        SQL:
+        SELECT cast(" + STATISTIC_PARTITION_VERSION + " as INT),
+
+        `partition_id`,
+        `column_name`,
+        hll_cardinality(hll_union(`ndv`)) as distinct_count
+    */
+
+    SCOPED_TIMER(_serialize_timer);
+
+    // mapping with Data.thrift.TStatisticData
+    DCHECK(columns.size() == 4);
+
+    // skip read version
+    auto partition_id = ColumnViewer<TYPE_BIGINT>(columns[1]);
+    auto column_name = ColumnViewer<TYPE_VARCHAR>(columns[2]);
+    auto distinct_count = ColumnViewer<TYPE_BIGINT>(columns[3]);
+    std::vector<TStatisticData> data_list;
+    int num_rows = chunk->num_rows();
+
+    data_list.resize(num_rows);
+    for (int i = 0; i < num_rows; ++i) {
+        data_list[i].__set_partitionId(partition_id.value(i));
+        data_list[i].__set_columnName(column_name.value(i).to_string());
+        data_list[i].__set_countDistinct(distinct_count.value(i));
+    }
+
+    result->result_batch.rows.resize(num_rows);
+    result->result_batch.__set_statistic_version(version);
+
+    ThriftSerializer serializer(true, chunk->memory_usage());
+    for (int i = 0; i < num_rows; ++i) {
+        RETURN_IF_ERROR(serializer.serialize(&data_list[i], &result->result_batch.rows[i]));
+    }
+    return Status::OK();
+}
+
 /*
     FE SQL:
     SELECT cast(5 as INT),
diff --git a/be/src/runtime/statistic_result_writer.h b/be/src/runtime/statistic_result_writer.h
index a96d4b2f85a00..7f6c3734b8bd8 100644
--- a/be/src/runtime/statistic_result_writer.h
+++ b/be/src/runtime/statistic_result_writer.h
@@ -53,6 +53,8 @@ class StatisticResultWriter final : public ResultWriter {
     Status _fill_table_statistic_data(int version, const Columns& columns, const Chunk* chunk,
                                       TFetchDataResult* result);
+    Status _fill_partition_statistic_data(int version, const Columns& columns, const Chunk* chunk,
+                                          TFetchDataResult* result);
     Status _fill_full_statistic_data_v4(int version, const Columns& columns, const Chunk* chunk,
                                         TFetchDataResult* result);
diff --git a/fe/fe-core/src/main/java/com/starrocks/qe/SessionVariable.java b/fe/fe-core/src/main/java/com/starrocks/qe/SessionVariable.java
index 963b8f9534f47..d2f88ad4d7e58 100644
--- a/fe/fe-core/src/main/java/com/starrocks/qe/SessionVariable.java
+++ b/fe/fe-core/src/main/java/com/starrocks/qe/SessionVariable.java
@@ -362,6 +362,8 @@ public class SessionVariable implements Serializable, Writable, Cloneable {
     public static final String CBO_PRUNE_JSON_SUBFIELD_DEPTH = "cbo_prune_json_subfield_depth";
     public static final String ENABLE_OPTIMIZER_REWRITE_GROUPINGSETS_TO_UNION_ALL =
             "enable_rewrite_groupingsets_to_union_all";
+    public static final String ENABLE_PARTITION_LEVEL_CARDINALITY_ESTIMATION =
+            "enable_partition_level_cardinality_estimation";
     public static final String CBO_USE_DB_LOCK = "cbo_use_lock_db";
     public static final String CBO_PREDICATE_SUBFIELD_PATH = "cbo_enable_predicate_subfield_path";
@@ -1280,6 +1282,9 @@ public static MaterializedViewRewriteMode parse(String str) {
     @VariableMgr.VarAttr(name = ENABLE_OPTIMIZER_REWRITE_GROUPINGSETS_TO_UNION_ALL)
     private boolean enableRewriteGroupingSetsToUnionAll = false;
 
+    @VariableMgr.VarAttr(name = ENABLE_PARTITION_LEVEL_CARDINALITY_ESTIMATION, flag = VariableMgr.INVISIBLE)
+    private boolean enablePartitionLevelCardinalityEstimation = true;
+
     // value should be 0~4
     // 0 represents automatic selection, and 1, 2, 3, and 4 represent forced selection of AGG of
     // corresponding stages respectively. However, stages 3 and 4 can only be generated in
@@ -3213,6 +3218,14 @@ public void setEnableRewriteGroupingSetsToUnionAll(boolean enableRewriteGrouping
         this.enableRewriteGroupingSetsToUnionAll = enableRewriteGroupingSetsToUnionAll;
     }
 
+    public boolean isEnablePartitionLevelCardinalityEstimation() {
+        return enablePartitionLevelCardinalityEstimation;
+    }
+
+    public void setEnablePartitionLevelCardinalityEstimation(boolean enablePartitionLevelCardinalityEstimation) {
+        this.enablePartitionLevelCardinalityEstimation = enablePartitionLevelCardinalityEstimation;
+    }
+
     public void setEnableLowCardinalityOptimize(boolean enableLowCardinalityOptimize) {
         this.enableLowCardinalityOptimize = enableLowCardinalityOptimize;
     }
diff --git a/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/CachedStatisticStorage.java b/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/CachedStatisticStorage.java
index 796581a8b4ce9..973a313b940d7 100644
--- a/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/CachedStatisticStorage.java
+++ b/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/CachedStatisticStorage.java
@@ -31,6 +31,7 @@
 import com.starrocks.connector.statistics.ConnectorTableColumnStats;
 import com.starrocks.server.GlobalStateMgr;
 import com.starrocks.statistic.StatisticUtils;
+import org.apache.commons.collections4.MapUtils;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
@@ -59,13 +60,20 @@ public class CachedStatisticStorage implements StatisticStorage {
             .executor(statsCacheRefresherExecutor)
             .buildAsync(new TableStatsCacheLoader());
 
-    AsyncLoadingCache<ColumnStatsCacheKey, Optional<ColumnStatistic>> cachedStatistics = Caffeine.newBuilder()
+    AsyncLoadingCache<ColumnStatsCacheKey, Optional<ColumnStatistic>> columnStatistics = Caffeine.newBuilder()
             .expireAfterWrite(Config.statistic_update_interval_sec * 2, TimeUnit.SECONDS)
             .refreshAfterWrite(Config.statistic_update_interval_sec, TimeUnit.SECONDS)
             .maximumSize(Config.statistic_cache_columns)
             .executor(statsCacheRefresherExecutor)
             .buildAsync(new ColumnBasicStatsCacheLoader());
 
+    AsyncLoadingCache<ColumnStatsCacheKey, Optional<PartitionStats>> partitionStatistics = Caffeine.newBuilder()
+            .expireAfterWrite(Config.statistic_update_interval_sec * 2, TimeUnit.SECONDS)
+            .refreshAfterWrite(Config.statistic_update_interval_sec, TimeUnit.SECONDS)
+            .maximumSize(Config.statistic_cache_columns)
+            .executor(statsCacheRefresherExecutor)
+            .buildAsync(new PartitionStatsCacheLoader());
+
     AsyncLoadingCache> connectorTableCachedStatistics =
             Caffeine.newBuilder().expireAfterWrite(Config.statistic_update_interval_sec * 2, TimeUnit.SECONDS)
             .refreshAfterWrite(Config.statistic_update_interval_sec, TimeUnit.SECONDS)
             .maximumSize(Config.statistic_cache_columns)
             .executor(statsCacheRefresherExecutor)
             .buildAsync(new ColumnBasicStatsCacheLoader());
@@ -249,7 +257,7 @@ public ColumnStatistic getColumnStatistic(Table table, String column) {
         }
         try {
             CompletableFuture<Optional<ColumnStatistic>> result =
-                    cachedStatistics.get(new ColumnStatsCacheKey(table.getId(), column));
+                    columnStatistics.get(new ColumnStatsCacheKey(table.getId(), column));
             if (result.isDone()) {
                 Optional<ColumnStatistic> realResult;
                 realResult = result.get();
@@ -284,7 +292,8 @@ public List getColumnStatistics(Table table, List colum
         }
         try {
-            CompletableFuture<Map<ColumnStatsCacheKey, Optional<ColumnStatistic>>> result = cachedStatistics.getAll(cacheKeys);
+            CompletableFuture<Map<ColumnStatsCacheKey, Optional<ColumnStatistic>>> result =
+                    columnStatistics.getAll(cacheKeys);
             if (result.isDone()) {
                 List<ColumnStatistic> columnStatistics = new ArrayList<>();
                 Map<ColumnStatsCacheKey, Optional<ColumnStatistic>> realResult;
@@ -328,7 +337,8 @@ public List getColumnStatisticsSync(Table table, List c
         }
         try {
-            Map<ColumnStatsCacheKey, Optional<ColumnStatistic>> result = cachedStatistics.synchronous().getAll(cacheKeys);
+            Map<ColumnStatsCacheKey, Optional<ColumnStatistic>> result =
+                    columnStatistics.synchronous().getAll(cacheKeys);
             List<ColumnStatistic> columnStatistics = new ArrayList<>();
 
             for (String column : columns) {
@@ -347,6 +357,86 @@ public List getColumnStatisticsSync(Table table, List c
         }
     }
 
+    /**
+     * Query partition-level NDV for the given columns, served from the partition statistics cache.
+     */
+    private Map<String, PartitionStats> getColumnNDVForPartitions(Table table, List<Long> partitions,
+                                                                  List<String> columns) {
+
+        List<ColumnStatsCacheKey> cacheKeys = new ArrayList<>();
+        long tableId = table.getId();
+        for (String column : columns) {
+            cacheKeys.add(new ColumnStatsCacheKey(tableId, column));
+        }
+
+        try {
+            Map<ColumnStatsCacheKey, Optional<PartitionStats>> result =
+                    partitionStatistics.synchronous().getAll(cacheKeys);
+
+            Map<String, PartitionStats> columnStatistics = Maps.newHashMap();
+            for (String column : columns) {
+                Optional<PartitionStats> columnStatistic = result.get(new ColumnStatsCacheKey(tableId, column));
+                columnStatistics.put(column, columnStatistic.orElse(null));
+            }
+            return columnStatistics;
+        } catch (Exception e) {
+            LOG.warn("Get partition NDV fail", e);
+            return null;
+        }
+    }
+
+    /**
+     * We don't really maintain all statistics for partition, as most of them are not necessary.
+     * Currently, the only partition-level statistic is DistinctCount, which may differ a lot among partitions
+     */
+    @Override
+    public Map<Long, List<ColumnStatistic>> getColumnStatisticsOfPartitionLevel(Table table, List<Long> partitions,
+                                                                                List<String> columns) {
+
+        Preconditions.checkState(table != null);
+
+        // get Statistics Table column info, just return default column statistics
+        if (StatisticUtils.statisticTableBlackListCheck(table.getId())) {
+            return null;
+        }
+        if (!StatisticUtils.checkStatisticTableStateNormal()) {
+            return null;
+        }
+
+        long tableId = table.getId();
+        List<ColumnStatsCacheKey> cacheKeys = columns.stream()
+                .map(x -> new ColumnStatsCacheKey(tableId, x))
+                .collect(Collectors.toList());
+        List<ColumnStatistic> columnStatistics = getColumnStatistics(table, columns);
+        Map<String, PartitionStats> columnNDVForPartitions = getColumnNDVForPartitions(table, partitions, columns);
+        if (MapUtils.isEmpty(columnNDVForPartitions)) {
+            return null;
+        }
+
+        Map<Long, List<ColumnStatistic>> result = Maps.newHashMap();
+        for (long partition : partitions) {
+            List<ColumnStatistic> newStatistics = Lists.newArrayList();
+            for (int i = 0; i < columns.size(); i++) {
+                ColumnStatistic columnStatistic = columnStatistics.get(i);
+                PartitionStats partitionStats = columnNDVForPartitions.get(columns.get(i));
+                if (partitionStats == null) {
+                    // some of the columns miss statistics
+                    return null;
+                }
+                if (!partitionStats.getDistinctCount().containsKey(partition)) {
+                    // some of the partitions miss statistics
+                    return null;
+                }
+                double distinctCount = partitionStats.getDistinctCount().get(partition);
+                ColumnStatistic newStats = ColumnStatistic.buildFrom(columnStatistic)
+                        .setDistinctValuesCount(distinctCount).build();
+                newStatistics.add(newStats);
+            }
+            result.put(partition, newStatistics);
+        }
+        return result;
+    }
+
     @Override
     public void expireTableAndColumnStatistics(Table table, List<String> columns) {
         List<TableStatsCacheKey> tableStatsCacheKeys = Lists.newArrayList();
@@ -363,12 +453,13 @@ public void expireTableAndColumnStatistics(Table table, List columns) {
             ColumnStatsCacheKey key = new ColumnStatsCacheKey(table.getId(), column);
             allKeys.add(key);
         }
-        cachedStatistics.synchronous().invalidateAll(allKeys);
+        columnStatistics.synchronous().invalidateAll(allKeys);
     }
 
     @Override
     public void addColumnStatistic(Table table, String column, ColumnStatistic columnStatistic) {
-        this.cachedStatistics.synchronous().put(new ColumnStatsCacheKey(table.getId(), column), Optional.of(columnStatistic));
+        this.columnStatistics.synchronous()
+                .put(new ColumnStatsCacheKey(table.getId(), column), Optional.of(columnStatistic));
     }
 
     @Override
diff --git a/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/PartitionStats.java b/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/PartitionStats.java
new file mode 100644
index 0000000000000..078488b79436a
--- /dev/null
+++ b/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/PartitionStats.java
@@ -0,0 +1,38 @@
+// Copyright 2021-present StarRocks, Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package com.starrocks.sql.optimizer.statistics;
+
+import com.google.common.collect.Maps;
+
+import java.util.Map;
+
+/**
+ * Partition-level statistics
+ */
+public class PartitionStats {
+    public final Map<Long, Double> distinctCount;
+
+    public PartitionStats() {
+        this.distinctCount = Maps.newHashMap();
+    }
+
+    public PartitionStats(Map<Long, Double> distinctCount) {
+        this.distinctCount = distinctCount;
+    }
+
+    public Map<Long, Double> getDistinctCount() {
+        return distinctCount;
+    }
+}
diff --git a/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/PartitionStatsCacheLoader.java b/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/PartitionStatsCacheLoader.java
new file mode 100644
index 0000000000000..3b53104543d8e
--- /dev/null
+++ b/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/PartitionStatsCacheLoader.java
@@ -0,0 +1,80 @@
+// Copyright 2021-present StarRocks, Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package com.starrocks.sql.optimizer.statistics;
+
+import com.github.benmanes.caffeine.cache.AsyncCacheLoader;
+import com.google.common.collect.Lists;
+import com.starrocks.qe.ConnectContext;
+import com.starrocks.statistic.StatisticExecutor;
+import com.starrocks.statistic.StatisticUtils;
+import com.starrocks.thrift.TStatisticData;
+import org.checkerframework.checker.nullness.qual.NonNull;
+
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.CompletionException;
+import java.util.concurrent.Executor;
+
+public class PartitionStatsCacheLoader implements AsyncCacheLoader<ColumnStatsCacheKey, Optional<PartitionStats>> {
+
+    private final StatisticExecutor statisticExecutor = new StatisticExecutor();
+
+    @Override
+    public @NonNull CompletableFuture<Optional<PartitionStats>> asyncLoad(@NonNull ColumnStatsCacheKey cacheKey,
+                                                                          @NonNull Executor executor) {
+        return asyncLoadAll(Lists.newArrayList(cacheKey), executor).thenApply(x -> x.get(cacheKey));
+    }
+
+    @Override
+    public @NonNull CompletableFuture<Map<ColumnStatsCacheKey, Optional<PartitionStats>>>
+    asyncLoadAll(@NonNull Iterable<? extends ColumnStatsCacheKey> cacheKey, @NonNull Executor executor) {
+        return CompletableFuture.supplyAsync(() -> {
+            try {
+                ConnectContext connectContext = StatisticUtils.buildConnectContext();
+                connectContext.setThreadLocalInfo();
+
+                Map<ColumnStatsCacheKey, Optional<PartitionStats>> result = new HashMap<>();
+                long tableId = -1;
+                List<String> columns = Lists.newArrayList();
+                for (ColumnStatsCacheKey statsCacheKey : cacheKey) {
+                    columns.add(statsCacheKey.column);
+                    tableId = statsCacheKey.tableId;
+                }
+                List<TStatisticData> statisticData = statisticExecutor.queryPartitionLevelColumnNDV(connectContext,
+                        tableId, Lists.newArrayList(), columns);
+                for (TStatisticData data : statisticData) {
+                    ColumnStatsCacheKey key = new ColumnStatsCacheKey(tableId, data.columnName);
+                    result.computeIfAbsent(key, (x) -> Optional.of(new PartitionStats()))
+                            .get().getDistinctCount().put(data.partitionId, (double) data.countDistinct);
+                }
+                for (ColumnStatsCacheKey key : cacheKey) {
+                    if (!result.containsKey(key)) {
+                        result.put(key, Optional.empty());
+                    }
+                }
+                return result;
+            } catch (RuntimeException e) {
+                throw e;
+            } catch (Exception e) {
+                throw new CompletionException(e);
+            } finally {
+                ConnectContext.remove();
+            }
+        }, executor);
+    }
+}
diff --git a/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/StatisticStorage.java b/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/StatisticStorage.java
index 214bb18e832eb..a6d9897a5312b 100644
--- a/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/StatisticStorage.java
+++ b/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/StatisticStorage.java
@@ -51,6 +51,14 @@ default void updatePartitionStatistics(long tableId, long partition, long rows)
     List<ColumnStatistic> getColumnStatistics(Table table, List<String> columns);
 
+    /**
+     * Return partition-level column statistics, which may not exist.
+     */
+    default Map<Long, List<ColumnStatistic>> getColumnStatisticsOfPartitionLevel(Table table, List<Long> partitions,
+                                                                                 List<String> columns) {
+        return null;
+    }
+
     default List<ColumnStatistic> getColumnStatisticsSync(Table table, List<String> columns) {
         return getColumnStatistics(table, columns);
     }
diff --git a/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/StatisticsCalcUtils.java b/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/StatisticsCalcUtils.java
index a34d309bb76df..6b4370ac6efe8 100644
--- a/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/StatisticsCalcUtils.java
+++ b/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/StatisticsCalcUtils.java
@@ -14,6 +14,8 @@
 package com.starrocks.sql.optimizer.statistics;
 
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
 import com.starrocks.catalog.Column;
 import com.starrocks.catalog.OlapTable;
 import com.starrocks.catalog.Partition;
@@ -28,6 +30,9 @@
 import com.starrocks.statistic.BasicStatsMeta;
 import com.starrocks.statistic.StatisticUtils;
 import com.starrocks.statistic.StatsConstants;
+import org.apache.commons.collections.CollectionUtils;
+import org.apache.commons.collections4.MapUtils;
+import org.jetbrains.annotations.Nullable;
 
 import java.time.LocalDateTime;
 import java.util.ArrayList;
@@ -78,33 +83,105 @@ public static Statistics.Builder estimateScanColumns(Table table,
         return builder;
     }
 
+    /**
+     * Return partition-level statistics if they exist.
+     * Only return the statistics if all columns and all partitions have the required statistics, otherwise return null
+     */
+    public static Map<Long, Statistics> getPartitionStatistics(Operator node, OlapTable table,
+                                                               Map<ColumnRefOperator, Column> columns) {
+
+        // 1. only FULL statistics has partition-level info
+        BasicStatsMeta basicStatsMeta =
+                GlobalStateMgr.getCurrentState().getAnalyzeMgr().getTableBasicStatsMeta(table.getId());
+        StatsConstants.AnalyzeType analyzeType = basicStatsMeta == null ?
+                null : basicStatsMeta.getType();
+        if (analyzeType != StatsConstants.AnalyzeType.FULL) {
+            return null;
+        }
+        List<Partition> selectedPartitions = getSelectedPartitions(node, table);
+        if (CollectionUtils.isEmpty(selectedPartitions)) {
+            return null;
+        }
+        List<Long> partitionIdList = selectedPartitions.stream().map(Partition::getId).collect(Collectors.toList());
+
+        // column stats
+        List<String> columnNames = columns.values().stream().map(Column::getName).collect(Collectors.toList());
+        Map<Long, List<ColumnStatistic>> columnStatistics =
+                GlobalStateMgr.getCurrentState().getStatisticStorage().getColumnStatisticsOfPartitionLevel(table,
+                        partitionIdList, columnNames);
+        if (MapUtils.isEmpty(columnStatistics)) {
+            return null;
+        }
+
+        // partition rows
+        Map<Long, Long> partitionRows = getPartitionRows(table, basicStatsMeta, selectedPartitions);
+
+        Map<Long, Statistics> result = Maps.newHashMap();
+        Map<String, ColumnRefOperator> columnNameMap =
+                columns.entrySet().stream().collect(Collectors.toMap(x -> x.getValue().getName(), Map.Entry::getKey));
+        for (var entry : columnStatistics.entrySet()) {
+            Statistics.Builder builder = Statistics.builder();
+            for (int i = 0; i < columnNames.size(); i++) {
+                String columnName = columnNames.get(i);
+                ColumnStatistic columnStatistic = entry.getValue().get(i);
+                ColumnRefOperator ref = columnNameMap.get(columnName);
+                builder.addColumnStatistic(ref, columnStatistic);
+            }
+            long partitionRow = partitionRows.get(entry.getKey());
+            if (partitionRow == 1) {
+                builder.setTableRowCountMayInaccurate(true);
+            }
+            builder.setOutputRowCount(partitionRow);
+            result.put(entry.getKey(), builder.build());
+        }
+        return result;
+    }
+
     public static long getTableRowCount(Table table, Operator node) {
         return getTableRowCount(table, node, null);
     }
 
+    private static Map<Long, Long> getPartitionRows(Table table, BasicStatsMeta basicStatsMeta,
+                                                    Collection<Partition> selectedPartitions) {
+        // The basicStatsMeta.getUpdateRows() interface can get the number of
+        // loaded rows in the table since the last statistics update. But this number is at the table level.
+        // So here we can count the number of partitions that have changed since the last statistics update,
+        // and then evenly distribute the number of updated rows at the table level to the partition boundaries
+        // The purpose of this is to make the statistics of the number of rows more accurate.
+        // For example, a large amount of data LOAD may cause the number of rows to change greatly.
+        // This leads to very inaccurate row counts.
+        LocalDateTime lastWorkTimestamp = GlobalStateMgr.getCurrentState().getTabletStatMgr().getLastWorkTimestamp();
+        long deltaRows = deltaRows(table, basicStatsMeta.getUpdateRows());
+        Map<Long, Optional<Long>> tableStatisticMap = GlobalStateMgr.getCurrentState().getStatisticStorage()
+                .getTableStatistics(table.getId(), selectedPartitions);
+        Map<Long, Long> result = Maps.newHashMap();
+        for (Partition partition : selectedPartitions) {
+            long partitionRowCount;
+            Optional<Long> tableStatistic =
+                    tableStatisticMap.getOrDefault(partition.getId(), Optional.empty());
+            LocalDateTime updateDatetime = StatisticUtils.getPartitionLastUpdateTime(partition);
+            if (tableStatistic.isEmpty()) {
+                partitionRowCount = partition.getRowCount();
+                if (updateDatetime.isAfter(lastWorkTimestamp)) {
+                    partitionRowCount += deltaRows;
+                }
+            } else {
+                partitionRowCount = tableStatistic.get();
+                if (updateDatetime.isAfter(basicStatsMeta.getUpdateTime())) {
+                    partitionRowCount += deltaRows;
+                }
+            }
+
+            result.put(partition.getId(), partitionRowCount);
+        }
+        return result;
+    }
+
     public static long getTableRowCount(Table table, Operator node, OptimizerContext optimizerContext) {
         if (table.isNativeTableOrMaterializedView()) {
             OlapTable olapTable = (OlapTable) table;
-            Collection<Partition> selectedPartitions;
-            if (node.getOpType() == OperatorType.LOGICAL_BINLOG_SCAN ||
-                    node.getOpType() == OperatorType.PHYSICAL_STREAM_SCAN) {
+            Collection<Partition> selectedPartitions = getSelectedPartitions(node, olapTable);
+            if (selectedPartitions == null) {
                 return 1;
-            } else if (node.isLogical()) {
-                LogicalOlapScanOperator olapScanOperator = (LogicalOlapScanOperator) node;
-                if (olapScanOperator.getSelectedPartitionId() == null) {
-                    selectedPartitions = olapScanOperator.getTable().getPartitions();
-                } else {
-                    selectedPartitions = olapScanOperator.getSelectedPartitionId().stream().map(
-                            olapTable::getPartition).collect(Collectors.toList());
-                }
-            } else {
-                PhysicalOlapScanOperator olapScanOperator = (PhysicalOlapScanOperator) node;
-                if (olapScanOperator.getSelectedPartitionId() == null) {
-                    selectedPartitions = olapScanOperator.getTable().getPartitions();
-                } else {
-                    selectedPartitions = olapScanOperator.getSelectedPartitionId().stream().map(
-                            olapTable::getPartition).collect(Collectors.toList());
-                }
             }
 
             long rowCount = 0;
@@ -113,33 +190,10 @@ public static long getTableRowCount(Table table, Operator node, OptimizerContext
             StatsConstants.AnalyzeType analyzeType = basicStatsMeta == null ? null : basicStatsMeta.getType();
             LocalDateTime lastWorkTimestamp = GlobalStateMgr.getCurrentState().getTabletStatMgr().getLastWorkTimestamp();
             if (StatsConstants.AnalyzeType.FULL == analyzeType) {
-
-                // The basicStatsMeta.getUpdateRows() interface can get the number of
-                // loaded rows in the table since the last statistics update. But this number is at the table level.
-                // So here we can count the number of partitions that have changed since the last statistics update,
-                // and then evenly distribute the number of updated rows at the table level to the partition boundaries
-                // The purpose of this is to make the statistics of the number of rows more accurate.
-                // For example, a large amount of data LOAD may cause the number of rows to change greatly.
-                // This leads to very inaccurate row counts.
-                long deltaRows = deltaRows(table, basicStatsMeta.getUpdateRows());
-                Map<Long, Optional<Long>> tableStatisticMap = GlobalStateMgr.getCurrentState().getStatisticStorage()
-                        .getTableStatistics(table.getId(), selectedPartitions);
-                for (Partition partition : selectedPartitions) {
-                    long partitionRowCount;
-                    Optional<Long> tableStatistic =
-                            tableStatisticMap.getOrDefault(partition.getId(), Optional.empty());
-                    LocalDateTime updateDatetime = StatisticUtils.getPartitionLastUpdateTime(partition);
-                    if (tableStatistic.isEmpty()) {
-                        partitionRowCount = partition.getRowCount();
-                        if (updateDatetime.isAfter(lastWorkTimestamp)) {
-                            partitionRowCount += deltaRows;
-                        }
-                    } else {
-                        partitionRowCount = tableStatistic.get();
-                        if (updateDatetime.isAfter(basicStatsMeta.getUpdateTime())) {
-                            partitionRowCount += deltaRows;
-                        }
-                    }
+                Map<Long, Long> partitionRows =
+                        getPartitionRows(table, basicStatsMeta, selectedPartitions);
+                for (var partition : selectedPartitions) {
+                    long partitionRowCount = partitionRows.get(partition.getId());
                     updateQueryDumpInfo(optimizerContext, table, partition.getName(), partitionRowCount);
                     rowCount += partitionRowCount;
                 }
@@ -172,6 +226,31 @@ public static long getTableRowCount(Table table, Operator node, OptimizerContext
         return 1;
     }
 
+    private static @Nullable List<Partition> getSelectedPartitions(Operator node, OlapTable olapTable) {
+        List<Partition> selectedPartitions;
+        if (node.getOpType() == OperatorType.LOGICAL_BINLOG_SCAN ||
+                node.getOpType() == OperatorType.PHYSICAL_STREAM_SCAN) {
+            return null;
+        } else if (node.isLogical()) {
+            LogicalOlapScanOperator olapScanOperator = (LogicalOlapScanOperator) node;
+            if (olapScanOperator.getSelectedPartitionId() == null) {
+                selectedPartitions = Lists.newArrayList(olapScanOperator.getTable().getPartitions());
+            } else {
+                selectedPartitions = olapScanOperator.getSelectedPartitionId().stream().map(
+                        olapTable::getPartition).collect(Collectors.toList());
+            }
+        } else {
+            PhysicalOlapScanOperator olapScanOperator = (PhysicalOlapScanOperator) node;
+            if (olapScanOperator.getSelectedPartitionId() == null) {
+                selectedPartitions = Lists.newArrayList(olapScanOperator.getTable().getPartitions());
+            } else {
+                selectedPartitions = olapScanOperator.getSelectedPartitionId().stream().map(
+                        olapTable::getPartition).collect(Collectors.toList());
+            }
+        }
+        return selectedPartitions;
+    }
+
     private static void updateQueryDumpInfo(OptimizerContext optimizerContext, Table table, String partitionName,
                                             long rowCount) {
         if (optimizerContext != null && optimizerContext.getDumpInfo() != null) {
diff --git a/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/StatisticsCalculator.java b/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/StatisticsCalculator.java
index d04ae5606f8d7..fbfc0aaeb9a16 100644
--- a/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/StatisticsCalculator.java
+++ b/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/StatisticsCalculator.java
@@ -148,6 +148,7 @@
 import com.starrocks.sql.optimizer.rule.transformation.ListPartitionPruner;
 import com.starrocks.statistic.StatisticUtils;
 import org.apache.commons.collections4.CollectionUtils;
+import org.apache.commons.collections4.MapUtils;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
@@ -219,7 +220,7 @@ public Void visitOperator(Operator node, ExpressionContext context) {
         predicate = removePartitionPredicate(predicate, node, optimizerContext);
         Statistics statistics = context.getStatistics();
-        if (null != predicate) {
+        if (null != predicate && !predicate.isNotEvalEstimate()) {
             statistics = estimateStatistics(ImmutableList.of(predicate), statistics);
         }
@@ -353,18 +354,102 @@ private Void computeOlapScanNode(Operator node, ExpressionContext context, Table
         OlapTable olapTable = (OlapTable) table;
         adjustPartitionColsStatistic(selectedPartitionIds, olapTable, builder, colRefToColumnMetaMap);
         builder.setOutputRowCount(tableRowCount);
+
+        // 4. deal with predicate cardinality for list partition table
+        adjustPredicateCardinality(olapTable, selectedPartitionIds, node, colRefToColumnMetaMap, builder);
+
+        // 5. Deal with OlapScan which is generated by MV
         if (isRewrittenMvGE(node, table, context)) {
             adjustNestedMvStatistics(context.getGroupExpression().getGroup(), (MaterializedView) olapTable, builder);
             if (node.getProjection() != null) {
                 builder.setShadowColumns(node.getProjection().getOutputColumns());
             }
         }
-        // 4. estimate cardinality
         context.setStatistics(builder.build());
         return visitOperator(node, context);
     }
 
+    /**
+     * For LIST partition table, we cannot use the table_rows * predicate_selectivity to estimate the output rows of
+     * ScanOperator, since the predicate selectivity is calculated with the table-level NDV, which may not be able
+     * to represent partition-level NDV.
+     * So here we use SUM(partition_rows * partition predicate_selectivity) to estimate output rows of ScanOperator
+     */
+    private void adjustPredicateCardinality(OlapTable table,
+                                            Collection<Long> partitions,
+                                            Operator node,
+                                            Map<ColumnRefOperator, Column> columnMap,
+                                            Statistics.Builder statistics) {
+        if (optimizerContext != null &&
+                !optimizerContext.getSessionVariable().isEnablePartitionLevelCardinalityEstimation()) {
+            return;
+        }
+        if (node.getPredicate() == null) {
+            return;
+        }
+        PartitionInfo partitionInfo = table.getPartitionInfo();
+        if (!partitionInfo.isListPartition() || CollectionUtils.isEmpty(partitions)) {
+            return;
+        }
+        ListPartitionInfo listPartitionInfo = (ListPartitionInfo) partitionInfo;
+        long totalPartitions = listPartitionInfo.getPartitionIds(false).size();
+        double selectedRatio = 1.0 * partitions.size() / totalPartitions;
+        if (selectedRatio > 0.3 || partitions.size() > 128) {
+            // if select most partitions, usually the table-level statistics is good enough
+            return;
+        }
+
+        ScalarOperator predicate = node.getPredicate();
+        Map<Long, Statistics> partitionStatistics =
+                StatisticsCalcUtils.getPartitionStatistics(node, table, columnMap);
+        if (MapUtils.isEmpty(partitionStatistics)) {
+            return;
+        }
+        long tableRows = 0;
+        predicate = removePartitionPredicate(predicate, node, optimizerContext);
+        for (var entry : partitionStatistics.entrySet()) {
+            Statistics partitionStat = estimateStatistics(ImmutableList.of(predicate), entry.getValue());
+            long partitionSelectedRows = (long) partitionStat.getOutputRowCount();
+            if (partitionStat.isTableRowCountMayInaccurate()) {
+                return;
+            }
+            tableRows += partitionSelectedRows;
+        }
+        // adjust output rows
+        predicate.setNotEvalEstimate(true);
+        statistics.setOutputRowCount(tableRows);
+
+        // adjust output column statistics if possible
+        // The basic theory is, if selected partitions have similar NDV, we would assume that they are in the same
+        // domain, so it's reasonable to use the average NDV to represent all of them.
+        for (ColumnRefOperator column : columnMap.keySet()) {
+            List<ColumnStatistic> partitionStats = partitionStatistics.values().stream()
+                    .map(x -> x.getColumnStatistic(column))
+                    .collect(Collectors.toList());
+            if (CollectionUtils.isEmpty(partitionStats) || partitionStats.stream().anyMatch(Objects::isNull)) {
+                return;
+            }
+            double avgDistinctCount = partitionStats.stream()
+                    .mapToDouble(ColumnStatistic::getDistinctValuesCount)
+                    .average().getAsDouble();
+            boolean hasSimilarNDV = partitionStats.stream()
+                    .allMatch(x -> withinDelta(x.getDistinctValuesCount(), avgDistinctCount, 0.1));
+            if (hasSimilarNDV) {
+                ColumnStatistic tableStats = statistics.getColumnStatistics(column);
+                ColumnStatistic newStats = buildFrom(tableStats).setDistinctValuesCount(avgDistinctCount).build();
+                statistics.addColumnStatistic(column, newStats);
+            }
+        }
+    }
+
+    private boolean withinDelta(double a, double b, double ratio) {
+        double delta = Math.abs(a - b);
+        double actualRatio = Math.max(delta / a, delta / b);
+        return actualRatio <= ratio;
+    }
+
     private void adjustNestedMvStatistics(Group group, MaterializedView mv, Statistics.Builder builder) {
         for (Map.Entry<Long, Set<Long>> entry : group.getRelatedMvs().entrySet()) {
             if (entry.getValue().contains(mv.getId())) {
diff --git a/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/TableStatsCacheLoader.java b/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/TableStatsCacheLoader.java
index 9f3e402b08522..e4b2215294763 100644
--- a/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/TableStatsCacheLoader.java
+++ b/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/TableStatsCacheLoader.java
@@ -15,7 +15,7 @@
 package com.starrocks.sql.optimizer.statistics;
 
 import com.github.benmanes.caffeine.cache.AsyncCacheLoader;
-import com.google.api.client.util.Lists;
+import com.google.common.collect.Lists;
 import com.starrocks.common.Config;
 import com.starrocks.qe.ConnectContext;
 import com.starrocks.statistic.StatisticExecutor;
@@ -37,25 +37,9 @@ public class TableStatsCacheLoader implements AsyncCacheLoader
     public CompletableFuture<Optional<Long>> asyncLoad(@NonNull TableStatsCacheKey cacheKey,
                                                        @NonNull Executor executor) {
-        return CompletableFuture.supplyAsync(() -> {
-            try {
-                ConnectContext connectContext = StatisticUtils.buildConnectContext();
-                connectContext.setThreadLocalInfo();
-                List<TStatisticData> statisticData =
-                        statisticExecutor.queryTableStats(connectContext, cacheKey.tableId, cacheKey.partitionId);
-                if (statisticData.isEmpty()) {
-                    return Optional.empty();
-                } else {
-                    return Optional.of(statisticData.get(0).rowCount);
-                }
-            } catch (RuntimeException e) {
-                throw e;
-            } catch (Exception e) {
-                throw new CompletionException(e);
-            } finally {
-                ConnectContext.remove();
-            }
-        }, executor);
+
+        return asyncLoadAll(Lists.newArrayList(cacheKey), executor)
+                .thenApply(x -> x.get(cacheKey));
     }
 
     @Override
diff --git a/fe/fe-core/src/main/java/com/starrocks/statistic/StatisticExecutor.java b/fe/fe-core/src/main/java/com/starrocks/statistic/StatisticExecutor.java
index 0d8b8c06b1f2c..e7a7fd0245510 100644
--- a/fe/fe-core/src/main/java/com/starrocks/statistic/StatisticExecutor.java
+++ b/fe/fe-core/src/main/java/com/starrocks/statistic/StatisticExecutor.java
@@ -296,6 +296,12 @@ public List queryTableStats(ConnectContext context, Long tableId
         return executeStatisticDQL(context, sql);
     }
 
+    public List<TStatisticData> queryPartitionLevelColumnNDV(ConnectContext context, long tableId,
+                                                             List<Long> partitions, List<String> columns) {
+        String sql = StatisticSQLBuilder.buildQueryPartitionStatisticsSQL(tableId,
+                partitions, columns);
+        return executeStatisticDQL(context, sql);
+    }
+
     private static List<TStatisticData> deserializerStatisticData(List<TResultBatch> sqlResult) throws TException {
         List<TStatisticData> statistics = Lists.newArrayList();
@@ -308,14 +314,7 @@ private static List deserializerStatisticData(List
             return statistics;
         }
 
-        if (version == StatsConstants.STATISTIC_DATA_VERSION
-                || version == StatsConstants.STATISTIC_DICT_VERSION
-                || version == StatsConstants.STATISTIC_HISTOGRAM_VERSION
-                || version == StatsConstants.STATISTIC_TABLE_VERSION
-                || version == StatsConstants.STATISTIC_BATCH_VERSION
-                || version == StatsConstants.STATISTIC_EXTERNAL_VERSION
-                || version == StatsConstants.STATISTIC_EXTERNAL_QUERY_VERSION
-                || version == StatsConstants.STATISTIC_EXTERNAL_HISTOGRAM_VERSION) {
+        if (StatsConstants.STATISTIC_SUPPORTED_VERSION.contains(version)) {
             TDeserializer deserializer = new TDeserializer(new TCompactProtocol.Factory());
             for (TResultBatch resultBatch : sqlResult) {
                 for (ByteBuffer bb : resultBatch.rows) {
diff --git a/fe/fe-core/src/main/java/com/starrocks/statistic/StatisticSQLBuilder.java b/fe/fe-core/src/main/java/com/starrocks/statistic/StatisticSQLBuilder.java
index 5d95ce2134476..8949d5aab5671 100644
--- a/fe/fe-core/src/main/java/com/starrocks/statistic/StatisticSQLBuilder.java
+++ b/fe/fe-core/src/main/java/com/starrocks/statistic/StatisticSQLBuilder.java
@@ -19,6 +19,8 @@
 import com.google.common.collect.Lists;
 import com.google.common.collect.Maps;
 import com.starrocks.catalog.Type;
+import org.apache.commons.collections.CollectionUtils;
+import org.apache.logging.log4j.util.Strings;
 import org.apache.velocity.VelocityContext;
 import org.apache.velocity.app.VelocityEngine;
 
@@ -35,6 +37,7 @@
 import static com.starrocks.statistic.StatsConstants.STATISTIC_EXTERNAL_HISTOGRAM_VERSION;
 import static com.starrocks.statistic.StatsConstants.STATISTIC_EXTERNAL_QUERY_VERSION;
 import static com.starrocks.statistic.StatsConstants.STATISTIC_HISTOGRAM_VERSION;
+import static com.starrocks.statistic.StatsConstants.STATISTIC_PARTITION_VERSION;
 import static com.starrocks.statistic.StatsConstants.STATISTIC_TABLE_VERSION;
 
 public class StatisticSQLBuilder {
@@ -44,6 +47,13 @@ public class StatisticSQLBuilder {
             + " WHERE $predicate"
             + " GROUP BY partition_id";
 
+    private static final String QUERY_PARTITION_STATISTIC_TEMPLATE =
+            "SELECT cast(" + STATISTIC_PARTITION_VERSION + " as INT), "
+                    + " `partition_id`, `column_name`, hll_cardinality(hll_union(`ndv`)) as distinct_count"
+                    + " FROM " + FULL_STATISTICS_TABLE_NAME
+                    + " WHERE $predicate"
+                    + " GROUP BY `partition_id`, `column_name`";
+
     private static final String QUERY_SAMPLE_STATISTIC_TEMPLATE =
             "SELECT cast(" + STATISTIC_DATA_VERSION + " as INT), update_time, db_id, table_id, column_name,"
                     + " row_count, data_size, distinct_count, null_count, max, min"
@@ -96,6 +106,20 @@ public static String buildQueryTableStatisticsSQL(Long tableId, List parti
         return build(context, QUERY_TABLE_STATISTIC_TEMPLATE);
     }
 
+    public static String buildQueryPartitionStatisticsSQL(Long tableId, List<Long> partitionIds, List<String> columns) {
+        VelocityContext context = new VelocityContext();
+        String tablePredicate = "table_id=" + tableId;
+        String partitionPredicate = CollectionUtils.isEmpty(partitionIds) ? "" :
+                " AND `partition_id` in (" +
+                        partitionIds.stream().map(String::valueOf).collect(Collectors.joining(", ")) + ")";
+        String columnPredicate = CollectionUtils.isEmpty(columns) ? "" :
+                " AND `column_name` in (" +
+                        columns.stream().map(Strings::quote).collect(Collectors.joining(",")) + ")";
+        context.put("predicate", tablePredicate + partitionPredicate + columnPredicate);
+
+        return build(context, QUERY_PARTITION_STATISTIC_TEMPLATE);
+    }
+
     public static String buildQueryTableStatisticsSQL(Long tableId, Long partitionId) {
         VelocityContext context = new VelocityContext();
         context.put("predicate", "table_id = " + tableId + " and partition_id = " + partitionId);
diff --git a/fe/fe-core/src/main/java/com/starrocks/statistic/StatsConstants.java b/fe/fe-core/src/main/java/com/starrocks/statistic/StatsConstants.java
index 2c54bcd5ca776..8e6cd099ae3c8 100644
--- a/fe/fe-core/src/main/java/com/starrocks/statistic/StatsConstants.java
+++ b/fe/fe-core/src/main/java/com/starrocks/statistic/StatsConstants.java
@@ -15,6 +15,7 @@
 
 package com.starrocks.statistic;
 
+import com.google.common.collect.ImmutableSet;
 import com.google.common.collect.Maps;
 
 import java.util.Map;
@@ -30,6 +31,20 @@ public class StatsConstants {
     public static final int STATISTIC_EXTERNAL_VERSION = 5;
     public static final int STATISTIC_EXTERNAL_QUERY_VERSION = 6;
     public static final int STATISTIC_EXTERNAL_HISTOGRAM_VERSION = 7;
+    public static final int STATISTIC_PARTITION_VERSION = 11;
+
+    public static final ImmutableSet<Integer> STATISTIC_SUPPORTED_VERSION =
+            ImmutableSet.<Integer>builder()
+                    .add(STATISTIC_DATA_VERSION)
+                    .add(STATISTIC_DICT_VERSION)
+                    .add(STATISTIC_HISTOGRAM_VERSION)
+                    .add(STATISTIC_TABLE_VERSION)
+                    .add(STATISTIC_BATCH_VERSION)
+                    .add(STATISTIC_EXTERNAL_VERSION)
+                    .add(STATISTIC_EXTERNAL_QUERY_VERSION)
+                    .add(STATISTIC_EXTERNAL_HISTOGRAM_VERSION)
+                    .add(STATISTIC_PARTITION_VERSION)
+                    .build();
 
     public static final int STATISTICS_PARTITION_UPDATED_THRESHOLD = 10;
     public static final String STATISTICS_DB_NAME = "_statistics_";
diff --git a/test/lib/sr_sql_lib.py b/test/lib/sr_sql_lib.py
index 51e5059c03023..4a18bcd58c337 100644
--- a/test/lib/sr_sql_lib.py
+++ b/test/lib/sr_sql_lib.py
@@ -2576,7 +2576,8 @@ def assert_explain_costs_contains(self, query, *expects):
         sql = "explain costs %s" % query
         res = self.execute_sql(sql, True)
         for expect in expects:
-            tools.assert_true(str(res["result"]).find(expect) > 0, "assert expect %s is not found in plan" % (expect))
+            plan_string = "\n".join(item[0] for item in res["result"])
+            tools.assert_true(plan_string.find(expect) >= 0, "assert expect %s is not found in plan:\n %s" % (expect, plan_string))
 
     def assert_trace_values_contains(self, query, *expects):
         """
diff --git a/test/sql/test_list_partition/R/test_list_partition_selectivity b/test/sql/test_list_partition/R/test_list_partition_selectivity
new file mode 100644
index 0000000000000..dd8178bf659d4
--- /dev/null
+++ b/test/sql/test_list_partition/R/test_list_partition_selectivity
@@ -0,0 +1,106 @@
+-- name: test_list_partition_selectivity
+DROP DATABASE IF EXISTS test_list_partition_selectivity;
+-- result:
+-- !result
+CREATE DATABASE test_list_partition_selectivity;
+-- result:
+-- !result
+USE test_list_partition_selectivity;
+-- result:
+-- !result
+CREATE TABLE partitions_multi_column_1 (
+    c1 int NOT NULL,
+    c2 int NOT NULL,
+    c3 int
+)
+PARTITION BY (c1, c2);
+-- result:
+-- !result
+INSERT INTO partitions_multi_column_1 SELECT 0, 0, generate_series FROM TABLE(generate_series(1, 1000));
+-- result:
+-- !result
+INSERT INTO partitions_multi_column_1 SELECT 0, 1, generate_series FROM TABLE(generate_series(1, 100));
+-- result:
+-- !result
+INSERT INTO partitions_multi_column_1 SELECT 0, 2, 
generate_series FROM TABLE(generate_series(1, 10)); +-- result: +-- !result +INSERT INTO partitions_multi_column_1 SELECT 0, 3, generate_series FROM TABLE(generate_series(1, 1)); +-- result: +-- !result +INSERT INTO partitions_multi_column_1 SELECT 1, 0, generate_series FROM TABLE(generate_series(1, 100)); +-- result: +-- !result +INSERT INTO partitions_multi_column_1 SELECT 2, 0, generate_series FROM TABLE(generate_series(1, 100)); +-- result: +-- !result +INSERT INTO partitions_multi_column_1 SELECT 3, 0, generate_series FROM TABLE(generate_series(1, 100)); +-- result: +-- !result +INSERT INTO partitions_multi_column_1 SELECT 4, 0, generate_series FROM TABLE(generate_series(1, 100)); +-- result: +-- !result +ANALYZE FULL TABLE partitions_multi_column_1 WITH SYNC MODE; +-- result: +test_list_partition_selectivity.partitions_multi_column_1 analyze status OK +-- !result +SELECT count(*) FROM partitions_multi_column_1; +-- result: +1511 +-- !result +function: assert_explain_costs_contains('SELECT c2, c3 FROM partitions_multi_column_1 WHERE c1=0 AND c2=0 ', 'cardinality: 1000', 'c3-->[1.0, 1000.0, 0.0, 4.0, 1005.0]') +-- result: +None +-- !result +function: assert_explain_costs_contains('SELECT c2, c3 FROM partitions_multi_column_1 WHERE c1=0 AND c2=1 ', 'cardinality: 100', 'c3-->[1.0, 1000.0, 0.0, 4.0, 100.0]') +-- result: +None +-- !result +function: assert_explain_costs_contains('SELECT c2, c3 FROM partitions_multi_column_1 WHERE c1=0 AND c2=2 ', 'cardinality: 10', 'c3-->[1.0, 1000.0, 0.0, 4.0, 10.0]') +-- result: +None +-- !result +function: assert_explain_costs_contains('SELECT c2, c3 FROM partitions_multi_column_1 WHERE c1=0 AND c2=3 ', 'cardinality: 1', 'c3-->[1.0, 1000.0, 0.0, 4.0, 1005.0]') +-- result: +None +-- !result +function: assert_explain_costs_contains('SELECT c2, c3 FROM partitions_multi_column_1 WHERE c1=1 AND c2=0 ', 'cardinality: 100', 'c3-->[1.0, 1000.0, 0.0, 4.0, 100.0]') +-- result: +None +-- !result +function: assert_explain_costs_contains('SELECT c2, c3 FROM partitions_multi_column_1 WHERE c1=2 AND c2=0 ', 'cardinality: 100', 'c3-->[1.0, 1000.0, 0.0, 4.0, 100.0]') +-- result: +None +-- !result +function: assert_explain_costs_contains('SELECT c2, c3 FROM partitions_multi_column_1 WHERE c1=3 AND c2=0 ', 'cardinality: 100', 'c3-->[1.0, 1000.0, 0.0, 4.0, 100.0]') +-- result: +None +-- !result +function: assert_explain_costs_contains('SELECT c2, c3 FROM partitions_multi_column_1 WHERE c1=4 AND c2=0 ', 'cardinality: 100', 'c3-->[1.0, 1000.0, 0.0, 4.0, 100.0]') +-- result: +None +-- !result +function: assert_explain_costs_contains('SELECT c2, c3 FROM partitions_multi_column_1 WHERE c1=0 ', 'cardinality: 1111', 'c3-->[1.0, 1000.0, 0.0, 4.0, 1005.0]') +-- result: +None +-- !result +function: assert_explain_costs_contains('SELECT c2, c3 FROM partitions_multi_column_1 WHERE c2=0 ', 'cardinality: 1400', 'c3-->[1.0, 1000.0, 0.0, 4.0, 1005.0]') +-- result: +None +-- !result +function: assert_explain_costs_contains('SELECT c2, c3 FROM partitions_multi_column_1 WHERE c1 IN (1,2,3,4) AND c2=0 ', 'cardinality: 400', 'c3-->[1.0, 1000.0, 0.0, 4.0, 1005.0]') +-- result: +None +-- !result +function: assert_explain_costs_contains('SELECT c2, c3 FROM partitions_multi_column_1 WHERE c1 IN (3,4) AND c2=0 ', 'cardinality: 200', 'c3-->[1.0, 1000.0, 0.0, 4.0, 100.0]') +-- result: +None +-- !result +function: assert_explain_costs_contains('SELECT c2, c3 FROM partitions_multi_column_1 WHERE c1 IN (2,4) AND c2=0 ', 'cardinality: 200', 'c3-->[1.0, 1000.0, 0.0, 4.0, 100.0]') +-- result: +None +-- 
!result +function: assert_explain_costs_contains('SELECT c2, c3 FROM partitions_multi_column_1 WHERE c1 IN (2,4) ', 'cardinality: 200', 'c3-->[1.0, 1000.0, 0.0, 4.0, 100.0]') +-- result: +None +-- !result \ No newline at end of file diff --git a/test/sql/test_list_partition/T/test_list_partition_selectivity b/test/sql/test_list_partition/T/test_list_partition_selectivity new file mode 100644 index 0000000000000..fb65c483101a0 --- /dev/null +++ b/test/sql/test_list_partition/T/test_list_partition_selectivity @@ -0,0 +1,45 @@ +-- name: test_list_partition_selectivity + +DROP DATABASE IF EXISTS test_list_partition_selectivity; +CREATE DATABASE test_list_partition_selectivity; +USE test_list_partition_selectivity; + +CREATE TABLE partitions_multi_column_1 ( + c1 int NOT NULL, + c2 int NOT NULL, + c3 int +) +PARTITION BY (c1, c2); + +INSERT INTO partitions_multi_column_1 SELECT 0, 0, generate_series FROM TABLE(generate_series(1, 1000)); +INSERT INTO partitions_multi_column_1 SELECT 0, 1, generate_series FROM TABLE(generate_series(1, 100)); +INSERT INTO partitions_multi_column_1 SELECT 0, 2, generate_series FROM TABLE(generate_series(1, 10)); +INSERT INTO partitions_multi_column_1 SELECT 0, 3, generate_series FROM TABLE(generate_series(1, 1)); +INSERT INTO partitions_multi_column_1 SELECT 1, 0, generate_series FROM TABLE(generate_series(1, 100)); +INSERT INTO partitions_multi_column_1 SELECT 2, 0, generate_series FROM TABLE(generate_series(1, 100)); +INSERT INTO partitions_multi_column_1 SELECT 3, 0, generate_series FROM TABLE(generate_series(1, 100)); +INSERT INTO partitions_multi_column_1 SELECT 4, 0, generate_series FROM TABLE(generate_series(1, 100)); + +ANALYZE FULL TABLE partitions_multi_column_1 WITH SYNC MODE; + +SELECT count(*) FROM partitions_multi_column_1; + +-- select only 1 partition +function: assert_explain_costs_contains('SELECT c2, c3 FROM partitions_multi_column_1 WHERE c1=0 AND c2=0 ', 'cardinality: 1000', 'c3-->[1.0, 1000.0, 0.0, 4.0, 1005.0]') +function: assert_explain_costs_contains('SELECT c2, c3 FROM partitions_multi_column_1 WHERE c1=0 AND c2=1 ', 'cardinality: 100', 'c3-->[1.0, 1000.0, 0.0, 4.0, 100.0]') +function: assert_explain_costs_contains('SELECT c2, c3 FROM partitions_multi_column_1 WHERE c1=0 AND c2=2 ', 'cardinality: 10', 'c3-->[1.0, 1000.0, 0.0, 4.0, 10.0]') +function: assert_explain_costs_contains('SELECT c2, c3 FROM partitions_multi_column_1 WHERE c1=0 AND c2=3 ', 'cardinality: 1', 'c3-->[1.0, 1000.0, 0.0, 4.0, 1005.0]') +function: assert_explain_costs_contains('SELECT c2, c3 FROM partitions_multi_column_1 WHERE c1=1 AND c2=0 ', 'cardinality: 100', 'c3-->[1.0, 1000.0, 0.0, 4.0, 100.0]') +function: assert_explain_costs_contains('SELECT c2, c3 FROM partitions_multi_column_1 WHERE c1=2 AND c2=0 ', 'cardinality: 100', 'c3-->[1.0, 1000.0, 0.0, 4.0, 100.0]') +function: assert_explain_costs_contains('SELECT c2, c3 FROM partitions_multi_column_1 WHERE c1=3 AND c2=0 ', 'cardinality: 100', 'c3-->[1.0, 1000.0, 0.0, 4.0, 100.0]') +function: assert_explain_costs_contains('SELECT c2, c3 FROM partitions_multi_column_1 WHERE c1=4 AND c2=0 ', 'cardinality: 100', 'c3-->[1.0, 1000.0, 0.0, 4.0, 100.0]') + +-- select multiple partitions with diverse NDV +function: assert_explain_costs_contains('SELECT c2, c3 FROM partitions_multi_column_1 WHERE c1=0 ', 'cardinality: 1111', 'c3-->[1.0, 1000.0, 0.0, 4.0, 1005.0]') +function: assert_explain_costs_contains('SELECT c2, c3 FROM partitions_multi_column_1 WHERE c2=0 ', 'cardinality: 1400', 'c3-->[1.0, 1000.0, 0.0, 4.0, 1005.0]') 
+function: assert_explain_costs_contains('SELECT c2, c3 FROM partitions_multi_column_1 WHERE c1 IN (1,2,3,4) AND c2=0 ', 'cardinality: 400', 'c3-->[1.0, 1000.0, 0.0, 4.0, 1005.0]') + +-- select multiple partitions with similar NDV +function: assert_explain_costs_contains('SELECT c2, c3 FROM partitions_multi_column_1 WHERE c1 IN (3,4) AND c2=0 ', 'cardinality: 200', 'c3-->[1.0, 1000.0, 0.0, 4.0, 100.0]') +function: assert_explain_costs_contains('SELECT c2, c3 FROM partitions_multi_column_1 WHERE c1 IN (2,4) AND c2=0 ', 'cardinality: 200', 'c3-->[1.0, 1000.0, 0.0, 4.0, 100.0]') +function: assert_explain_costs_contains('SELECT c2, c3 FROM partitions_multi_column_1 WHERE c1 IN (2,4) ', 'cardinality: 200', 'c3-->[1.0, 1000.0, 0.0, 4.0, 100.0]') \ No newline at end of file
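
The three sketches below are editorial illustrations written against this diff, not part of the patch; any identifier that does not appear above (class names, sample table/partition ids) is hypothetical.

First, the statistics query. buildQueryPartitionStatisticsSQL renders QUERY_PARTITION_STATISTIC_TEMPLATE with a $predicate assembled from the table id plus optional partition and column filters. Assuming FULL_STATISTICS_TABLE_NAME resolves to _statistics_.column_statistics, the produced SQL looks like what this self-contained sketch prints:

import java.util.List;
import java.util.stream.Collectors;

public class PartitionNdvSqlSketch {
    public static void main(String[] args) {
        long tableId = 10001L;                           // hypothetical table id
        List<Long> partitionIds = List.of(2001L, 2002L); // hypothetical partition ids
        List<String> columns = List.of("c3");            // hypothetical column name

        // Mirrors the predicate assembly in buildQueryPartitionStatisticsSQL:
        // table predicate, then optional partition and column filters.
        String predicate = "table_id=" + tableId
                + " AND `partition_id` in (" + partitionIds.stream()
                        .map(String::valueOf).collect(Collectors.joining(", ")) + ")"
                + " AND `column_name` in (" + columns.stream()
                        .map(c -> "'" + c + "'").collect(Collectors.joining(",")) + ")";

        // STATISTIC_PARTITION_VERSION is 11; the statistics table name is an assumption here.
        String sql = "SELECT cast(11 as INT), "
                + " `partition_id`, `column_name`, hll_cardinality(hll_union(`ndv`)) as distinct_count"
                + " FROM _statistics_.column_statistics"
                + " WHERE " + predicate
                + " GROUP BY `partition_id`, `column_name`";
        System.out.println(sql);
    }
}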
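
Next, the partition row counts. getPartitionRows starts from the per-partition counts recorded by the last FULL collection and, per the comment in StatisticsCalcUtils, spreads the table-level update rows evenly across the partitions whose data changed after the statistics were collected. The even split inside deltaRows() is an assumption inferred from that comment, and the numbers are made up:

public class DeltaRowsSketch {
    public static void main(String[] args) {
        // Hypothetical: 3000 rows loaded since the last ANALYZE, across 3 changed partitions.
        long updateRowsSinceAnalyze = 3000;
        int changedPartitions = 3;
        long deltaRows = updateRowsSinceAnalyze / changedPartitions; // assumed deltaRows() result

        // Per-partition row counts recorded by the last FULL collection.
        long[] analyzedRowCounts = {1000, 100, 10};
        long total = 0;
        for (long rows : analyzedRowCounts) {
            // Each partition updated after the stats collection gets one even share.
            total += rows + deltaRows;
        }
        System.out.println("estimated total rows = " + total); // 1110 + 3 * 1000 = 4110
    }
}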
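
Finally, the cardinality adjustment in adjustPredicateCardinality, checked against the expectations in the test files above: WHERE c1 IN (3,4) AND c2=0 prunes to two partitions of 100 rows with NDV(c3)=100 each, and the whole predicate is removed as a partition predicate, so each partition's selectivity is 1.0. withinDelta mirrors the helper added in StatisticsCalculator:

public class PartitionCardinalitySketch {
    static boolean withinDelta(double a, double b, double ratio) {
        double delta = Math.abs(a - b);
        return Math.max(delta / a, delta / b) <= ratio;
    }

    public static void main(String[] args) {
        long[] partitionRows = {100, 100};       // rows in the two selected partitions
        double[] partitionNdv = {100.0, 100.0};  // per-partition NDV of c3
        double selectivity = 1.0;                // predicate fully handled by partition pruning

        long outputRows = 0;
        for (long rows : partitionRows) {
            outputRows += (long) (rows * selectivity); // SUM(partition_rows * selectivity)
        }

        double avgNdv = (partitionNdv[0] + partitionNdv[1]) / 2;
        boolean similar = withinDelta(partitionNdv[0], avgNdv, 0.1)
                && withinDelta(partitionNdv[1], avgNdv, 0.1);

        // Expect 200 output rows ('cardinality: 200') and, because the per-partition NDVs
        // agree within 10%, c3's NDV rewritten to 100: 'c3-->[1.0, 1000.0, 0.0, 4.0, 100.0]'.
        System.out.println("rows=" + outputRows + ", similarNDV=" + similar + ", ndv=" + avgNdv);
    }
}

If the per-partition estimate misbehaves, the whole adjustment can be switched off via the invisible session variable enable_partition_level_cardinality_estimation added above.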