From f8f2c058ae07eb89e138cc097837c919ac8c9398 Mon Sep 17 00:00:00 2001
From: tpp <terry.purcell@pingcap.com>
Date: Fri, 25 Oct 2024 09:14:31 -0700
Subject: [PATCH 1/6] planner: Use modifyCount when all topN collected

---
 pkg/planner/cardinality/row_count_column.go | 2 +-
 pkg/planner/cardinality/row_count_index.go  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pkg/planner/cardinality/row_count_column.go b/pkg/planner/cardinality/row_count_column.go
index 5e6d75bf19a74..a28e636922ecd 100644
--- a/pkg/planner/cardinality/row_count_column.go
+++ b/pkg/planner/cardinality/row_count_column.go
@@ -177,7 +177,7 @@ func equalRowCountOnColumn(sctx planctx.PlanContext, c *statistics.Column, val t
 		if modifyCount == 0 {
 			return 0, nil
 		}
-		return 1, nil
+		return max(1, float64(modifyCount)/float64(c.Histogram.NDV)), nil
 	}
 	return c.Histogram.NotNullCount() / histNDV, nil
 }
diff --git a/pkg/planner/cardinality/row_count_index.go b/pkg/planner/cardinality/row_count_index.go
index 4393bf3b4552d..5f1197bdd9522 100644
--- a/pkg/planner/cardinality/row_count_index.go
+++ b/pkg/planner/cardinality/row_count_index.go
@@ -419,7 +419,7 @@ func equalRowCountOnIndex(sctx planctx.PlanContext, idx *statistics.Index, b []b
 		if modifyCount == 0 {
 			return 0
 		}
-		return 1
+		return max(1, float64(modifyCount)/float64(idx.Histogram.NDV))
 	}
 	return idx.Histogram.NotNullCount() / histNDV
 }

From 4a63d1f290e60c0776e8748bdd9ed3f00c3b3ec7 Mon Sep 17 00:00:00 2001
From: tpp <terry.purcell@pingcap.com>
Date: Mon, 28 Oct 2024 08:58:40 -0700
Subject: [PATCH 2/6] review comments

---
 pkg/planner/cardinality/row_count_column.go | 12 +++++++++++-
 pkg/planner/cardinality/row_count_index.go  | 10 +++++++++-
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/pkg/planner/cardinality/row_count_column.go b/pkg/planner/cardinality/row_count_column.go
index a28e636922ecd..e2a88f41447f7 100644
--- a/pkg/planner/cardinality/row_count_column.go
+++ b/pkg/planner/cardinality/row_count_column.go
@@ -15,6 +15,8 @@
 package cardinality
 
 import (
+	"math"
+
 	"github.com/pingcap/errors"
 	"github.com/pingcap/tidb/pkg/planner/planctx"
 	"github.com/pingcap/tidb/pkg/planner/util/debugtrace"
@@ -177,7 +179,15 @@ func equalRowCountOnColumn(sctx planctx.PlanContext, c *statistics.Column, val t
 		if modifyCount == 0 {
 			return 0, nil
 		}
-		return max(1, float64(modifyCount)/float64(c.Histogram.NDV)), nil
+		// Use sqrt to create an NDV if one doesn't already exist, or reset to the original NDV
+		if c.Histogram.NDV <= 0 {
+			histNDV = math.Sqrt(max(c.Histogram.NotNullCount(), float64(modifyCount)))
+		} else {
+			histNDV = float64(c.Histogram.NDV)
+		}
+		// Return the min of the original notNullCount or the modifyCount/NDV. This is to reduce the
+		// risk of too large or too small an estimate if modifyCount is large or NotNullCount is small
+		return max(1, min(c.Histogram.NotNullCount(), float64(modifyCount)/histNDV)), nil
 	}
 	return c.Histogram.NotNullCount() / histNDV, nil
 }
diff --git a/pkg/planner/cardinality/row_count_index.go b/pkg/planner/cardinality/row_count_index.go
index 5f1197bdd9522..a0951d762c875 100644
--- a/pkg/planner/cardinality/row_count_index.go
+++ b/pkg/planner/cardinality/row_count_index.go
@@ -419,7 +419,15 @@ func equalRowCountOnIndex(sctx planctx.PlanContext, idx *statistics.Index, b []b
 		if modifyCount == 0 {
 			return 0
 		}
-		return max(1, float64(modifyCount)/float64(idx.Histogram.NDV))
+		// Use sqrt to create an NDV if one doesn't already exist, or reset to the original NDV
+		if idx.Histogram.NDV <= 0 {
+			histNDV = math.Sqrt(max(idx.Histogram.NotNullCount(), float64(modifyCount)))
+		} else {
+			histNDV = float64(idx.Histogram.NDV)
+		}
+		// Return the min of the original notNullCount or the modifyCount/NDV. This is to reduce the
+		// risk of too large or too small an estimate if modifyCount is large or NotNullCount is small
+		return max(1, min(idx.Histogram.NotNullCount(), float64(modifyCount)/histNDV))
 	}
 	return idx.Histogram.NotNullCount() / histNDV
 }

From 4b5cf7029a3ba6876044b8e02efe621dd1a2c286 Mon Sep 17 00:00:00 2001
From: tpp <terry.purcell@pingcap.com>
Date: Fri, 1 Nov 2024 08:20:20 -0700
Subject: [PATCH 3/6] reverse IF statement to be NDV > 0

---
 pkg/planner/cardinality/row_count_column.go | 8 ++++----
 pkg/planner/cardinality/row_count_index.go  | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/pkg/planner/cardinality/row_count_column.go b/pkg/planner/cardinality/row_count_column.go
index e2a88f41447f7..48c549b42ae31 100644
--- a/pkg/planner/cardinality/row_count_column.go
+++ b/pkg/planner/cardinality/row_count_column.go
@@ -179,11 +179,11 @@ func equalRowCountOnColumn(sctx planctx.PlanContext, c *statistics.Column, val t
 		if modifyCount == 0 {
 			return 0, nil
 		}
-		// Use sqrt to create an NDV if one doesn't already exist, or reset to the original NDV
-		if c.Histogram.NDV <= 0 {
-			histNDV = math.Sqrt(max(c.Histogram.NotNullCount(), float64(modifyCount)))
-		} else {
+		// Reset to the original NDV, or if no NDV - derive an NDV using sqrt
+		if c.Histogram.NDV > 0 {
 			histNDV = float64(c.Histogram.NDV)
+		} else {
+			histNDV = math.Sqrt(max(c.Histogram.NotNullCount(), float64(modifyCount)))
 		}
 		// Return the min of the original notNullCount or the modifyCount/NDV. This is to reduce the
 		// risk of too large or too small an estimate if modifyCount is large or NotNullCount is small
diff --git a/pkg/planner/cardinality/row_count_index.go b/pkg/planner/cardinality/row_count_index.go
index a0951d762c875..3a6b1e018e9e8 100644
--- a/pkg/planner/cardinality/row_count_index.go
+++ b/pkg/planner/cardinality/row_count_index.go
@@ -419,11 +419,11 @@ func equalRowCountOnIndex(sctx planctx.PlanContext, idx *statistics.Index, b []b
 		if modifyCount == 0 {
 			return 0
 		}
-		// Use sqrt to create an NDV if one doesn't already exist, or reset to the original NDV
-		if idx.Histogram.NDV <= 0 {
-			histNDV = math.Sqrt(max(idx.Histogram.NotNullCount(), float64(modifyCount)))
-		} else {
+		// Reset to the original NDV, or if no NDV - derive an NDV using sqrt
+		if idx.Histogram.NDV > 0 {
 			histNDV = float64(idx.Histogram.NDV)
+		} else {
+			histNDV = math.Sqrt(max(idx.Histogram.NotNullCount(), float64(modifyCount)))
 		}
 		// Return the min of the original notNullCount or the modifyCount/NDV. This is to reduce the
 		// risk of too large or too small an estimate if modifyCount is large or NotNullCount is small

From 4de7e0eb702395776b018609aaaea8f72b05b1fe Mon Sep 17 00:00:00 2001
From: tpp <terry.purcell@pingcap.com>
Date: Wed, 6 Nov 2024 13:50:17 -0800
Subject: [PATCH 4/6] create new testcase

---
 pkg/planner/cardinality/row_count_column.go | 14 ++++--
 pkg/planner/cardinality/row_count_index.go  |  8 ++--
 pkg/planner/cardinality/selectivity_test.go | 51 +++++++++++++++++++++
 3 files changed, 64 insertions(+), 9 deletions(-)

diff --git a/pkg/planner/cardinality/row_count_column.go b/pkg/planner/cardinality/row_count_column.go
index 48c549b42ae31..682f8e01667c5 100644
--- a/pkg/planner/cardinality/row_count_column.go
+++ b/pkg/planner/cardinality/row_count_column.go
@@ -175,20 +175,24 @@ func equalRowCountOnColumn(sctx planctx.PlanContext, c *statistics.Column, val t
 	// 3. use uniform distribution assumption for the rest (even when this value is not covered by the range of stats)
 	histNDV := float64(c.Histogram.NDV - int64(c.TopN.Num()))
 	if histNDV <= 0 {
-		// If the table hasn't been modified, it's safe to return 0. Otherwise, the TopN could be stale - return 1.
+		// If the table hasn't been modified, it's safe to return 0.
 		if modifyCount == 0 {
 			return 0, nil
 		}
+		// ELSE calculate an approximate estimate based upon newly inserted rows.
+		//
 		// Reset to the original NDV, or if no NDV - derive an NDV using sqrt
 		if c.Histogram.NDV > 0 {
 			histNDV = float64(c.Histogram.NDV)
 		} else {
-			histNDV = math.Sqrt(max(c.Histogram.NotNullCount(), float64(modifyCount)))
+			histNDV = math.Sqrt(max(c.NotNullCount(), float64(realtimeRowCount)))
 		}
-		// Return the min of the original notNullCount or the modifyCount/NDV. This is to reduce the
-		// risk of too large or too small an estimate if modifyCount is large or NotNullCount is small
-		return max(1, min(c.Histogram.NotNullCount(), float64(modifyCount)/histNDV)), nil
+		// As a conservative estimate - take the smaller of the orignal totalRows or the additions.
+		// "realtimeRowCount - original count" is a better measure of inserts than modifyCount
+		totalRowCount := min(c.NotNullCount(), float64(realtimeRowCount)-c.NotNullCount())
+		return max(1, totalRowCount/histNDV), nil
 	}
+	// return the average histogram rows (which excludes topN) and NDV that excluded topN
 	return c.Histogram.NotNullCount() / histNDV, nil
 }
 
diff --git a/pkg/planner/cardinality/row_count_index.go b/pkg/planner/cardinality/row_count_index.go
index 6a711c54b5e6f..ea65698ca5383 100644
--- a/pkg/planner/cardinality/row_count_index.go
+++ b/pkg/planner/cardinality/row_count_index.go
@@ -423,11 +423,11 @@ func equalRowCountOnIndex(sctx planctx.PlanContext, idx *statistics.Index, b []b
 		if idx.Histogram.NDV > 0 {
 			histNDV = float64(idx.Histogram.NDV)
 		} else {
-			histNDV = math.Sqrt(max(idx.Histogram.NotNullCount(), float64(modifyCount)))
+			histNDV = math.Sqrt(max(idx.TotalRowCount(), float64(realtimeRowCount)))
 		}
-		// Return the min of the original notNullCount or the modifyCount/NDV. This is to reduce the
-		// risk of too large or too small an estimate if modifyCount is large or NotNullCount is small
-		return max(1, min(idx.Histogram.NotNullCount(), float64(modifyCount)/histNDV))
+		// As a conservative estimate - take the smaller of the orignal totalRows or the difference
+		totalRowCount := min(idx.TotalRowCount(), float64(realtimeRowCount)-idx.TotalRowCount())
+		return max(1, totalRowCount/histNDV)
 	}
 	return idx.Histogram.NotNullCount() / histNDV
 }
diff --git a/pkg/planner/cardinality/selectivity_test.go b/pkg/planner/cardinality/selectivity_test.go
index 7f065f89d75ab..6cf812881ca96 100644
--- a/pkg/planner/cardinality/selectivity_test.go
+++ b/pkg/planner/cardinality/selectivity_test.go
@@ -289,6 +289,57 @@ func TestEstimationForUnknownValues(t *testing.T) {
 	require.Equal(t, 0.0, count)
 }
 
+func TestEstimationForUnknownValuesAfterModify(t *testing.T) {
+	store, dom := testkit.CreateMockStoreAndDomain(t)
+	testKit := testkit.NewTestKit(t, store)
+	testKit.MustExec("use test")
+	testKit.MustExec("drop table if exists t")
+	testKit.MustExec("create table t(a int, key idx(a))")
+	testKit.MustExec("set @@tidb_analyze_version=2")
+	testKit.MustExec("set @@global.tidb_enable_auto_analyze='OFF'")
+	for i := 1; i <= 10; i++ {
+		testKit.MustExec(fmt.Sprintf("insert into t values (%d)", i))
+		testKit.MustExec(fmt.Sprintf("insert into t values (%d)", i))
+		testKit.MustExec(fmt.Sprintf("insert into t values (%d)", i))
+		testKit.MustExec(fmt.Sprintf("insert into t values (%d)", i))
+		testKit.MustExec(fmt.Sprintf("insert into t values (%d)", i))
+		testKit.MustExec(fmt.Sprintf("insert into t select a from t where a = %d", i))
+	}
+	testKit.MustExec("analyze table t")
+	h := dom.StatsHandle()
+	require.Nil(t, h.DumpStatsDeltaToKV(true))
+
+	table, err := dom.InfoSchema().TableByName(context.Background(), pmodel.NewCIStr("test"), pmodel.NewCIStr("t"))
+	require.NoError(t, err)
+	statsTbl := h.GetTableStats(table.Meta())
+
+	// Search for a found value == 10.0
+	sctx := mock.NewContext()
+	col := statsTbl.GetCol(table.Meta().Columns[0].ID)
+	count, err := cardinality.GetColumnRowCount(sctx, col, getRange(5, 5), statsTbl.RealtimeCount, statsTbl.ModifyCount, false)
+	require.NoError(t, err)
+	require.Equal(t, 10.0, count)
+
+	// Search for a not found value with zero modifyCount. Defaults to count == 1.0
+	count, err = cardinality.GetColumnRowCount(sctx, col, getRange(11, 11), statsTbl.RealtimeCount, statsTbl.ModifyCount, false)
+	require.NoError(t, err)
+	require.Equal(t, 1.0, count)
+
+	// Add another 200 rows to the table
+	testKit.MustExec("insert into t select a+10 from t")
+	testKit.MustExec("insert into t select a+10 from t where a <= 10")
+	//time.Sleep(5 * time.Second)
+	require.Nil(t, h.DumpStatsDeltaToKV(true))
+	//statsTblnew := h.GetTableStats(table.Meta())
+
+	// Search for a not found value based upon statistics - count should be >= 10 and <=39
+	//	count, err = cardinality.GetColumnRowCount(sctx, col, getRange(15, 15), statsTblnew.RealtimeCount, statsTblnew.ModifyCount, false)
+	count, err = cardinality.GetColumnRowCount(sctx, col, getRange(15, 15), 300, 200, false)
+	require.NoError(t, err)
+	require.Truef(t, count < 40, "expected: between 10 to 39, got: %v", count)
+	require.Truef(t, count > 9, "expected: between 10 to 39, got: %v", count)
+}
+
 func TestEstimationUniqueKeyEqualConds(t *testing.T) {
 	store, dom := testkit.CreateMockStoreAndDomain(t)
 	testKit := testkit.NewTestKit(t, store)

From 6aa663ab949fe875c77295860f7346b67ca92048 Mon Sep 17 00:00:00 2001
From: tpp <terry.purcell@pingcap.com>
Date: Wed, 6 Nov 2024 14:14:02 -0800
Subject: [PATCH 5/6] bazel

---
 pkg/planner/cardinality/BUILD.bazel         |  2 +-
 pkg/planner/cardinality/row_count_column.go |  3 +++
 pkg/planner/cardinality/row_count_index.go  | 11 +++++++++--
 3 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/pkg/planner/cardinality/BUILD.bazel b/pkg/planner/cardinality/BUILD.bazel
index 0c5f0dc481ea8..36657805f4bce 100644
--- a/pkg/planner/cardinality/BUILD.bazel
+++ b/pkg/planner/cardinality/BUILD.bazel
@@ -59,7 +59,7 @@ go_test(
     data = glob(["testdata/**"]),
     embed = [":cardinality"],
     flaky = True,
-    shard_count = 28,
+    shard_count = 29,
     deps = [
         "//pkg/config",
         "//pkg/domain",
diff --git a/pkg/planner/cardinality/row_count_column.go b/pkg/planner/cardinality/row_count_column.go
index 682f8e01667c5..fa1d8364ed6e3 100644
--- a/pkg/planner/cardinality/row_count_column.go
+++ b/pkg/planner/cardinality/row_count_column.go
@@ -175,6 +175,9 @@ func equalRowCountOnColumn(sctx planctx.PlanContext, c *statistics.Column, val t
 	// 3. use uniform distribution assumption for the rest (even when this value is not covered by the range of stats)
 	histNDV := float64(c.Histogram.NDV - int64(c.TopN.Num()))
 	if histNDV <= 0 {
+		// If histNDV is zero - we have all NDV's in TopN - and no histograms. This function uses
+		// c.NotNullCount rather than c.Histogram.NotNullCount() since the histograms are empty.
+		//
 		// If the table hasn't been modified, it's safe to return 0.
 		if modifyCount == 0 {
 			return 0, nil
diff --git a/pkg/planner/cardinality/row_count_index.go b/pkg/planner/cardinality/row_count_index.go
index ea65698ca5383..928395ae76eef 100644
--- a/pkg/planner/cardinality/row_count_index.go
+++ b/pkg/planner/cardinality/row_count_index.go
@@ -415,20 +415,27 @@ func equalRowCountOnIndex(sctx planctx.PlanContext, idx *statistics.Index, b []b
 	// 3. use uniform distribution assumption for the rest (even when this value is not covered by the range of stats)
 	histNDV := float64(idx.Histogram.NDV - int64(idx.TopN.Num()))
 	if histNDV <= 0 {
-		// If the table hasn't been modified, it's safe to return 0. Otherwise, the TopN could be stale - return 1.
+		// If histNDV is zero - we have all NDV's in TopN - and no histograms. This function uses
+		// idx.TotalRowCount rather than idx.Histogram.NotNullCount() since the histograms are empty.
+		//
+		// If the table hasn't been modified, it's safe to return 0.
 		if modifyCount == 0 {
 			return 0
 		}
+		// ELSE calculate an approximate estimate based upon newly inserted rows.
+		//
 		// Reset to the original NDV, or if no NDV - derive an NDV using sqrt
 		if idx.Histogram.NDV > 0 {
 			histNDV = float64(idx.Histogram.NDV)
 		} else {
 			histNDV = math.Sqrt(max(idx.TotalRowCount(), float64(realtimeRowCount)))
 		}
-		// As a conservative estimate - take the smaller of the orignal totalRows or the difference
+		// As a conservative estimate - take the smaller of the orignal totalRows or the additions.
+		// "realtimeRowCount - original count" is a better measure of inserts than modifyCount
 		totalRowCount := min(idx.TotalRowCount(), float64(realtimeRowCount)-idx.TotalRowCount())
 		return max(1, totalRowCount/histNDV)
 	}
+	// return the average histogram rows (which excludes topN) and NDV that excluded topN
 	return idx.Histogram.NotNullCount() / histNDV
 }
 

From a6102b2ea6fbe299314181671c25a9402ead78eb Mon Sep 17 00:00:00 2001
From: tpp <terry.purcell@pingcap.com>
Date: Wed, 6 Nov 2024 17:03:17 -0800
Subject: [PATCH 6/6] corrected new testcase

---
 pkg/planner/cardinality/selectivity_test.go | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/pkg/planner/cardinality/selectivity_test.go b/pkg/planner/cardinality/selectivity_test.go
index 6cf812881ca96..311e0e0f5301c 100644
--- a/pkg/planner/cardinality/selectivity_test.go
+++ b/pkg/planner/cardinality/selectivity_test.go
@@ -328,16 +328,15 @@ func TestEstimationForUnknownValuesAfterModify(t *testing.T) {
 	// Add another 200 rows to the table
 	testKit.MustExec("insert into t select a+10 from t")
 	testKit.MustExec("insert into t select a+10 from t where a <= 10")
-	//time.Sleep(5 * time.Second)
 	require.Nil(t, h.DumpStatsDeltaToKV(true))
-	//statsTblnew := h.GetTableStats(table.Meta())
+	require.Nil(t, h.Update(context.Background(), dom.InfoSchema()))
+	statsTblnew := h.GetTableStats(table.Meta())
 
-	// Search for a not found value based upon statistics - count should be >= 10 and <=39
-	//	count, err = cardinality.GetColumnRowCount(sctx, col, getRange(15, 15), statsTblnew.RealtimeCount, statsTblnew.ModifyCount, false)
-	count, err = cardinality.GetColumnRowCount(sctx, col, getRange(15, 15), 300, 200, false)
+	// Search for a not found value based upon statistics - count should be >= 10 and <=40
+	count, err = cardinality.GetColumnRowCount(sctx, col, getRange(15, 15), statsTblnew.RealtimeCount, statsTblnew.ModifyCount, false)
 	require.NoError(t, err)
-	require.Truef(t, count < 40, "expected: between 10 to 39, got: %v", count)
-	require.Truef(t, count > 9, "expected: between 10 to 39, got: %v", count)
+	require.Truef(t, count < 41, "expected: between 10 to 40, got: %v", count)
+	require.Truef(t, count > 9, "expected: between 10 to 40, got: %v", count)
 }
 
 func TestEstimationUniqueKeyEqualConds(t *testing.T) {