From f8f2c058ae07eb89e138cc097837c919ac8c9398 Mon Sep 17 00:00:00 2001 From: tpp Date: Fri, 25 Oct 2024 09:14:31 -0700 Subject: [PATCH 1/6] planner: Use modifyCount when all topN collected --- pkg/planner/cardinality/row_count_column.go | 2 +- pkg/planner/cardinality/row_count_index.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/planner/cardinality/row_count_column.go b/pkg/planner/cardinality/row_count_column.go index 5e6d75bf19a74..a28e636922ecd 100644 --- a/pkg/planner/cardinality/row_count_column.go +++ b/pkg/planner/cardinality/row_count_column.go @@ -177,7 +177,7 @@ func equalRowCountOnColumn(sctx planctx.PlanContext, c *statistics.Column, val t if modifyCount == 0 { return 0, nil } - return 1, nil + return max(1, float64(modifyCount)/float64(c.Histogram.NDV)), nil } return c.Histogram.NotNullCount() / histNDV, nil } diff --git a/pkg/planner/cardinality/row_count_index.go b/pkg/planner/cardinality/row_count_index.go index 4393bf3b4552d..5f1197bdd9522 100644 --- a/pkg/planner/cardinality/row_count_index.go +++ b/pkg/planner/cardinality/row_count_index.go @@ -419,7 +419,7 @@ func equalRowCountOnIndex(sctx planctx.PlanContext, idx *statistics.Index, b []b if modifyCount == 0 { return 0 } - return 1 + return max(1, float64(modifyCount)/float64(idx.Histogram.NDV)) } return idx.Histogram.NotNullCount() / histNDV } From 4a63d1f290e60c0776e8748bdd9ed3f00c3b3ec7 Mon Sep 17 00:00:00 2001 From: tpp Date: Mon, 28 Oct 2024 08:58:40 -0700 Subject: [PATCH 2/6] review comments --- pkg/planner/cardinality/row_count_column.go | 12 +++++++++++- pkg/planner/cardinality/row_count_index.go | 10 +++++++++- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/pkg/planner/cardinality/row_count_column.go b/pkg/planner/cardinality/row_count_column.go index a28e636922ecd..e2a88f41447f7 100644 --- a/pkg/planner/cardinality/row_count_column.go +++ b/pkg/planner/cardinality/row_count_column.go @@ -15,6 +15,8 @@ package cardinality import ( + "math" + "github.com/pingcap/errors" "github.com/pingcap/tidb/pkg/planner/planctx" "github.com/pingcap/tidb/pkg/planner/util/debugtrace" @@ -177,7 +179,15 @@ func equalRowCountOnColumn(sctx planctx.PlanContext, c *statistics.Column, val t if modifyCount == 0 { return 0, nil } - return max(1, float64(modifyCount)/float64(c.Histogram.NDV)), nil + // Use sqrt to create an NDV if one doesn't already exist, or reset to the original NDV + if c.Histogram.NDV <= 0 { + histNDV = math.Sqrt(max(c.Histogram.NotNullCount(), float64(modifyCount))) + } else { + histNDV = float64(c.Histogram.NDV) + } + // Return the min of the original notNullCount or the modifyCount/NDV. This is to reduce the + // risk of too large or too small an estimate if modifyCount is large or NotNullCount is small + return max(1, min(c.Histogram.NotNullCount(), float64(modifyCount)/histNDV)), nil } return c.Histogram.NotNullCount() / histNDV, nil } diff --git a/pkg/planner/cardinality/row_count_index.go b/pkg/planner/cardinality/row_count_index.go index 5f1197bdd9522..a0951d762c875 100644 --- a/pkg/planner/cardinality/row_count_index.go +++ b/pkg/planner/cardinality/row_count_index.go @@ -419,7 +419,15 @@ func equalRowCountOnIndex(sctx planctx.PlanContext, idx *statistics.Index, b []b if modifyCount == 0 { return 0 } - return max(1, float64(modifyCount)/float64(idx.Histogram.NDV)) + // Use sqrt to create an NDV if one doesn't already exist, or reset to the original NDV + if idx.Histogram.NDV <= 0 { + histNDV = math.Sqrt(max(idx.Histogram.NotNullCount(), float64(modifyCount))) + } else { + histNDV = float64(idx.Histogram.NDV) + } + // Return the min of the original notNullCount or the modifyCount/NDV. This is to reduce the + // risk of too large or too small an estimate if modifyCount is large or NotNullCount is small + return max(1, min(idx.Histogram.NotNullCount(), float64(modifyCount)/histNDV)) } return idx.Histogram.NotNullCount() / histNDV } From 4b5cf7029a3ba6876044b8e02efe621dd1a2c286 Mon Sep 17 00:00:00 2001 From: tpp Date: Fri, 1 Nov 2024 08:20:20 -0700 Subject: [PATCH 3/6] reverse IF statement to be NDV > 0 --- pkg/planner/cardinality/row_count_column.go | 8 ++++---- pkg/planner/cardinality/row_count_index.go | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pkg/planner/cardinality/row_count_column.go b/pkg/planner/cardinality/row_count_column.go index e2a88f41447f7..48c549b42ae31 100644 --- a/pkg/planner/cardinality/row_count_column.go +++ b/pkg/planner/cardinality/row_count_column.go @@ -179,11 +179,11 @@ func equalRowCountOnColumn(sctx planctx.PlanContext, c *statistics.Column, val t if modifyCount == 0 { return 0, nil } - // Use sqrt to create an NDV if one doesn't already exist, or reset to the original NDV - if c.Histogram.NDV <= 0 { - histNDV = math.Sqrt(max(c.Histogram.NotNullCount(), float64(modifyCount))) - } else { + // Reset to the original NDV, or if no NDV - derive an NDV using sqrt + if c.Histogram.NDV > 0 { histNDV = float64(c.Histogram.NDV) + } else { + histNDV = math.Sqrt(max(c.Histogram.NotNullCount(), float64(modifyCount))) } // Return the min of the original notNullCount or the modifyCount/NDV. This is to reduce the // risk of too large or too small an estimate if modifyCount is large or NotNullCount is small diff --git a/pkg/planner/cardinality/row_count_index.go b/pkg/planner/cardinality/row_count_index.go index a0951d762c875..3a6b1e018e9e8 100644 --- a/pkg/planner/cardinality/row_count_index.go +++ b/pkg/planner/cardinality/row_count_index.go @@ -419,11 +419,11 @@ func equalRowCountOnIndex(sctx planctx.PlanContext, idx *statistics.Index, b []b if modifyCount == 0 { return 0 } - // Use sqrt to create an NDV if one doesn't already exist, or reset to the original NDV - if idx.Histogram.NDV <= 0 { - histNDV = math.Sqrt(max(idx.Histogram.NotNullCount(), float64(modifyCount))) - } else { + // Reset to the original NDV, or if no NDV - derive an NDV using sqrt + if idx.Histogram.NDV > 0 { histNDV = float64(idx.Histogram.NDV) + } else { + histNDV = math.Sqrt(max(idx.Histogram.NotNullCount(), float64(modifyCount))) } // Return the min of the original notNullCount or the modifyCount/NDV. This is to reduce the // risk of too large or too small an estimate if modifyCount is large or NotNullCount is small From 4de7e0eb702395776b018609aaaea8f72b05b1fe Mon Sep 17 00:00:00 2001 From: tpp Date: Wed, 6 Nov 2024 13:50:17 -0800 Subject: [PATCH 4/6] create new testcase --- pkg/planner/cardinality/row_count_column.go | 14 ++++-- pkg/planner/cardinality/row_count_index.go | 8 ++-- pkg/planner/cardinality/selectivity_test.go | 51 +++++++++++++++++++++ 3 files changed, 64 insertions(+), 9 deletions(-) diff --git a/pkg/planner/cardinality/row_count_column.go b/pkg/planner/cardinality/row_count_column.go index 48c549b42ae31..682f8e01667c5 100644 --- a/pkg/planner/cardinality/row_count_column.go +++ b/pkg/planner/cardinality/row_count_column.go @@ -175,20 +175,24 @@ func equalRowCountOnColumn(sctx planctx.PlanContext, c *statistics.Column, val t // 3. use uniform distribution assumption for the rest (even when this value is not covered by the range of stats) histNDV := float64(c.Histogram.NDV - int64(c.TopN.Num())) if histNDV <= 0 { - // If the table hasn't been modified, it's safe to return 0. Otherwise, the TopN could be stale - return 1. + // If the table hasn't been modified, it's safe to return 0. if modifyCount == 0 { return 0, nil } + // ELSE calculate an approximate estimate based upon newly inserted rows. + // // Reset to the original NDV, or if no NDV - derive an NDV using sqrt if c.Histogram.NDV > 0 { histNDV = float64(c.Histogram.NDV) } else { - histNDV = math.Sqrt(max(c.Histogram.NotNullCount(), float64(modifyCount))) + histNDV = math.Sqrt(max(c.NotNullCount(), float64(realtimeRowCount))) } - // Return the min of the original notNullCount or the modifyCount/NDV. This is to reduce the - // risk of too large or too small an estimate if modifyCount is large or NotNullCount is small - return max(1, min(c.Histogram.NotNullCount(), float64(modifyCount)/histNDV)), nil + // As a conservative estimate - take the smaller of the orignal totalRows or the additions. + // "realtimeRowCount - original count" is a better measure of inserts than modifyCount + totalRowCount := min(c.NotNullCount(), float64(realtimeRowCount)-c.NotNullCount()) + return max(1, totalRowCount/histNDV), nil } + // return the average histogram rows (which excludes topN) and NDV that excluded topN return c.Histogram.NotNullCount() / histNDV, nil } diff --git a/pkg/planner/cardinality/row_count_index.go b/pkg/planner/cardinality/row_count_index.go index 6a711c54b5e6f..ea65698ca5383 100644 --- a/pkg/planner/cardinality/row_count_index.go +++ b/pkg/planner/cardinality/row_count_index.go @@ -423,11 +423,11 @@ func equalRowCountOnIndex(sctx planctx.PlanContext, idx *statistics.Index, b []b if idx.Histogram.NDV > 0 { histNDV = float64(idx.Histogram.NDV) } else { - histNDV = math.Sqrt(max(idx.Histogram.NotNullCount(), float64(modifyCount))) + histNDV = math.Sqrt(max(idx.TotalRowCount(), float64(realtimeRowCount))) } - // Return the min of the original notNullCount or the modifyCount/NDV. This is to reduce the - // risk of too large or too small an estimate if modifyCount is large or NotNullCount is small - return max(1, min(idx.Histogram.NotNullCount(), float64(modifyCount)/histNDV)) + // As a conservative estimate - take the smaller of the orignal totalRows or the difference + totalRowCount := min(idx.TotalRowCount(), float64(realtimeRowCount)-idx.TotalRowCount()) + return max(1, totalRowCount/histNDV) } return idx.Histogram.NotNullCount() / histNDV } diff --git a/pkg/planner/cardinality/selectivity_test.go b/pkg/planner/cardinality/selectivity_test.go index 7f065f89d75ab..6cf812881ca96 100644 --- a/pkg/planner/cardinality/selectivity_test.go +++ b/pkg/planner/cardinality/selectivity_test.go @@ -289,6 +289,57 @@ func TestEstimationForUnknownValues(t *testing.T) { require.Equal(t, 0.0, count) } +func TestEstimationForUnknownValuesAfterModify(t *testing.T) { + store, dom := testkit.CreateMockStoreAndDomain(t) + testKit := testkit.NewTestKit(t, store) + testKit.MustExec("use test") + testKit.MustExec("drop table if exists t") + testKit.MustExec("create table t(a int, key idx(a))") + testKit.MustExec("set @@tidb_analyze_version=2") + testKit.MustExec("set @@global.tidb_enable_auto_analyze='OFF'") + for i := 1; i <= 10; i++ { + testKit.MustExec(fmt.Sprintf("insert into t values (%d)", i)) + testKit.MustExec(fmt.Sprintf("insert into t values (%d)", i)) + testKit.MustExec(fmt.Sprintf("insert into t values (%d)", i)) + testKit.MustExec(fmt.Sprintf("insert into t values (%d)", i)) + testKit.MustExec(fmt.Sprintf("insert into t values (%d)", i)) + testKit.MustExec(fmt.Sprintf("insert into t select a from t where a = %d", i)) + } + testKit.MustExec("analyze table t") + h := dom.StatsHandle() + require.Nil(t, h.DumpStatsDeltaToKV(true)) + + table, err := dom.InfoSchema().TableByName(context.Background(), pmodel.NewCIStr("test"), pmodel.NewCIStr("t")) + require.NoError(t, err) + statsTbl := h.GetTableStats(table.Meta()) + + // Search for a found value == 10.0 + sctx := mock.NewContext() + col := statsTbl.GetCol(table.Meta().Columns[0].ID) + count, err := cardinality.GetColumnRowCount(sctx, col, getRange(5, 5), statsTbl.RealtimeCount, statsTbl.ModifyCount, false) + require.NoError(t, err) + require.Equal(t, 10.0, count) + + // Search for a not found value with zero modifyCount. Defaults to count == 1.0 + count, err = cardinality.GetColumnRowCount(sctx, col, getRange(11, 11), statsTbl.RealtimeCount, statsTbl.ModifyCount, false) + require.NoError(t, err) + require.Equal(t, 1.0, count) + + // Add another 200 rows to the table + testKit.MustExec("insert into t select a+10 from t") + testKit.MustExec("insert into t select a+10 from t where a <= 10") + //time.Sleep(5 * time.Second) + require.Nil(t, h.DumpStatsDeltaToKV(true)) + //statsTblnew := h.GetTableStats(table.Meta()) + + // Search for a not found value based upon statistics - count should be >= 10 and <=39 + // count, err = cardinality.GetColumnRowCount(sctx, col, getRange(15, 15), statsTblnew.RealtimeCount, statsTblnew.ModifyCount, false) + count, err = cardinality.GetColumnRowCount(sctx, col, getRange(15, 15), 300, 200, false) + require.NoError(t, err) + require.Truef(t, count < 40, "expected: between 10 to 39, got: %v", count) + require.Truef(t, count > 9, "expected: between 10 to 39, got: %v", count) +} + func TestEstimationUniqueKeyEqualConds(t *testing.T) { store, dom := testkit.CreateMockStoreAndDomain(t) testKit := testkit.NewTestKit(t, store) From 6aa663ab949fe875c77295860f7346b67ca92048 Mon Sep 17 00:00:00 2001 From: tpp Date: Wed, 6 Nov 2024 14:14:02 -0800 Subject: [PATCH 5/6] bazel --- pkg/planner/cardinality/BUILD.bazel | 2 +- pkg/planner/cardinality/row_count_column.go | 3 +++ pkg/planner/cardinality/row_count_index.go | 11 +++++++++-- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/pkg/planner/cardinality/BUILD.bazel b/pkg/planner/cardinality/BUILD.bazel index 0c5f0dc481ea8..36657805f4bce 100644 --- a/pkg/planner/cardinality/BUILD.bazel +++ b/pkg/planner/cardinality/BUILD.bazel @@ -59,7 +59,7 @@ go_test( data = glob(["testdata/**"]), embed = [":cardinality"], flaky = True, - shard_count = 28, + shard_count = 29, deps = [ "//pkg/config", "//pkg/domain", diff --git a/pkg/planner/cardinality/row_count_column.go b/pkg/planner/cardinality/row_count_column.go index 682f8e01667c5..fa1d8364ed6e3 100644 --- a/pkg/planner/cardinality/row_count_column.go +++ b/pkg/planner/cardinality/row_count_column.go @@ -175,6 +175,9 @@ func equalRowCountOnColumn(sctx planctx.PlanContext, c *statistics.Column, val t // 3. use uniform distribution assumption for the rest (even when this value is not covered by the range of stats) histNDV := float64(c.Histogram.NDV - int64(c.TopN.Num())) if histNDV <= 0 { + // If histNDV is zero - we have all NDV's in TopN - and no histograms. This function uses + // c.NotNullCount rather than c.Histogram.NotNullCount() since the histograms are empty. + // // If the table hasn't been modified, it's safe to return 0. if modifyCount == 0 { return 0, nil diff --git a/pkg/planner/cardinality/row_count_index.go b/pkg/planner/cardinality/row_count_index.go index ea65698ca5383..928395ae76eef 100644 --- a/pkg/planner/cardinality/row_count_index.go +++ b/pkg/planner/cardinality/row_count_index.go @@ -415,20 +415,27 @@ func equalRowCountOnIndex(sctx planctx.PlanContext, idx *statistics.Index, b []b // 3. use uniform distribution assumption for the rest (even when this value is not covered by the range of stats) histNDV := float64(idx.Histogram.NDV - int64(idx.TopN.Num())) if histNDV <= 0 { - // If the table hasn't been modified, it's safe to return 0. Otherwise, the TopN could be stale - return 1. + // If histNDV is zero - we have all NDV's in TopN - and no histograms. This function uses + // idx.TotalRowCount rather than idx.Histogram.NotNullCount() since the histograms are empty. + // + // If the table hasn't been modified, it's safe to return 0. if modifyCount == 0 { return 0 } + // ELSE calculate an approximate estimate based upon newly inserted rows. + // // Reset to the original NDV, or if no NDV - derive an NDV using sqrt if idx.Histogram.NDV > 0 { histNDV = float64(idx.Histogram.NDV) } else { histNDV = math.Sqrt(max(idx.TotalRowCount(), float64(realtimeRowCount))) } - // As a conservative estimate - take the smaller of the orignal totalRows or the difference + // As a conservative estimate - take the smaller of the orignal totalRows or the additions. + // "realtimeRowCount - original count" is a better measure of inserts than modifyCount totalRowCount := min(idx.TotalRowCount(), float64(realtimeRowCount)-idx.TotalRowCount()) return max(1, totalRowCount/histNDV) } + // return the average histogram rows (which excludes topN) and NDV that excluded topN return idx.Histogram.NotNullCount() / histNDV } From a6102b2ea6fbe299314181671c25a9402ead78eb Mon Sep 17 00:00:00 2001 From: tpp Date: Wed, 6 Nov 2024 17:03:17 -0800 Subject: [PATCH 6/6] corrected new testcase --- pkg/planner/cardinality/selectivity_test.go | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/pkg/planner/cardinality/selectivity_test.go b/pkg/planner/cardinality/selectivity_test.go index 6cf812881ca96..311e0e0f5301c 100644 --- a/pkg/planner/cardinality/selectivity_test.go +++ b/pkg/planner/cardinality/selectivity_test.go @@ -328,16 +328,15 @@ func TestEstimationForUnknownValuesAfterModify(t *testing.T) { // Add another 200 rows to the table testKit.MustExec("insert into t select a+10 from t") testKit.MustExec("insert into t select a+10 from t where a <= 10") - //time.Sleep(5 * time.Second) require.Nil(t, h.DumpStatsDeltaToKV(true)) - //statsTblnew := h.GetTableStats(table.Meta()) + require.Nil(t, h.Update(context.Background(), dom.InfoSchema())) + statsTblnew := h.GetTableStats(table.Meta()) - // Search for a not found value based upon statistics - count should be >= 10 and <=39 - // count, err = cardinality.GetColumnRowCount(sctx, col, getRange(15, 15), statsTblnew.RealtimeCount, statsTblnew.ModifyCount, false) - count, err = cardinality.GetColumnRowCount(sctx, col, getRange(15, 15), 300, 200, false) + // Search for a not found value based upon statistics - count should be >= 10 and <=40 + count, err = cardinality.GetColumnRowCount(sctx, col, getRange(15, 15), statsTblnew.RealtimeCount, statsTblnew.ModifyCount, false) require.NoError(t, err) - require.Truef(t, count < 40, "expected: between 10 to 39, got: %v", count) - require.Truef(t, count > 9, "expected: between 10 to 39, got: %v", count) + require.Truef(t, count < 41, "expected: between 10 to 40, got: %v", count) + require.Truef(t, count > 9, "expected: between 10 to 40, got: %v", count) } func TestEstimationUniqueKeyEqualConds(t *testing.T) {