Skip to content

Commit

Permalink
planner: Refactor out-of-range estimation based upon modifyCount (#57431
Browse files Browse the repository at this point in the history
)

close #58068
  • Loading branch information
terry1purcell authored Dec 18, 2024
1 parent aa19d3f commit a3574aa
Show file tree
Hide file tree
Showing 6 changed files with 160 additions and 146 deletions.
24 changes: 4 additions & 20 deletions pkg/planner/cardinality/row_count_column.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,6 @@
package cardinality

import (
"math"

"github.com/pingcap/errors"
"github.com/pingcap/tidb/pkg/planner/planctx"
"github.com/pingcap/tidb/pkg/planner/util/debugtrace"
Expand Down Expand Up @@ -177,23 +175,9 @@ func equalRowCountOnColumn(sctx planctx.PlanContext, c *statistics.Column, val t
if histNDV <= 0 {
// If histNDV is zero - we have all NDV's in TopN - and no histograms. This function uses
// c.NotNullCount rather than c.Histogram.NotNullCount() since the histograms are empty.
//
// If the table hasn't been modified, it's safe to return 0.
if modifyCount == 0 {
return 0, nil
}
// ELSE calculate an approximate estimate based upon newly inserted rows.
//
// Reset to the original NDV, or if no NDV - derive an NDV using sqrt
if c.Histogram.NDV > 0 {
histNDV = float64(c.Histogram.NDV)
} else {
histNDV = math.Sqrt(max(c.NotNullCount(), float64(realtimeRowCount)))
}
// As a conservative estimate - take the smaller of the original totalRows or the additions.
// "realtimeRowCount - original count" is a better measure of inserts than modifyCount
totalRowCount := min(c.NotNullCount(), float64(realtimeRowCount)-c.NotNullCount())
return max(1, totalRowCount/histNDV), nil
// c.Histogram.NDV stores the full NDV regardless of histograms empty or populated.
increaseFactor := c.GetIncreaseFactor(realtimeRowCount)
return outOfRangeFullNDV(float64(c.Histogram.NDV), c.TotalRowCount(), c.NotNullCount(), float64(realtimeRowCount), increaseFactor, modifyCount), nil
}
// return the average histogram rows (which excludes topN) and NDV that excluded topN
return c.Histogram.NotNullCount() / histNDV, nil
Expand Down Expand Up @@ -322,7 +306,7 @@ func GetColumnRowCount(sctx planctx.PlanContext, c *statistics.Column, ranges []
if c.StatsVer == statistics.Version2 {
histNDV -= int64(c.TopN.Num())
}
cnt += c.Histogram.OutOfRangeRowCount(sctx, &lowVal, &highVal, modifyCount, histNDV, increaseFactor)
cnt += c.Histogram.OutOfRangeRowCount(sctx, &lowVal, &highVal, realtimeRowCount, modifyCount, histNDV)
}

if debugTrace {
Expand Down
29 changes: 9 additions & 20 deletions pkg/planner/cardinality/row_count_index.go
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ func isSingleColIdxNullRange(idx *statistics.Index, ran *ranger.Range) bool {
return false
}

// It uses the modifyCount to adjust the influence of modifications on the table.
// It uses the modifyCount to validate, and realtimeRowCount to adjust the influence of modifications on the table.
func getIndexRowCountForStatsV2(sctx planctx.PlanContext, idx *statistics.Index, coll *statistics.HistColl, indexRanges []*ranger.Range, realtimeRowCount, modifyCount int64) (float64, error) {
sc := sctx.GetSessionVars().StmtCtx
debugTrace := sc.EnableOptimizerDebugTrace
Expand Down Expand Up @@ -350,7 +350,7 @@ func getIndexRowCountForStatsV2(sctx planctx.PlanContext, idx *statistics.Index,
histNDV -= int64(idx.TopN.Num())
}
}
count += idx.Histogram.OutOfRangeRowCount(sctx, &l, &r, modifyCount, histNDV, increaseFactor)
count += idx.Histogram.OutOfRangeRowCount(sctx, &l, &r, realtimeRowCount, modifyCount, histNDV)
}

if debugTrace {
Expand Down Expand Up @@ -415,25 +415,14 @@ func equalRowCountOnIndex(sctx planctx.PlanContext, idx *statistics.Index, b []b
// 3. use uniform distribution assumption for the rest (even when this value is not covered by the range of stats)
histNDV := float64(idx.Histogram.NDV - int64(idx.TopN.Num()))
if histNDV <= 0 {
// If histNDV is zero - we have all NDV's in TopN - and no histograms. This function uses
// idx.TotalRowCount rather than idx.Histogram.NotNullCount() since the histograms are empty.
//
// If the table hasn't been modified, it's safe to return 0.
if modifyCount == 0 {
return 0
}
// ELSE calculate an approximate estimate based upon newly inserted rows.
//
// Reset to the original NDV, or if no NDV - derive an NDV using sqrt
if idx.Histogram.NDV > 0 {
histNDV = float64(idx.Histogram.NDV)
} else {
histNDV = math.Sqrt(max(idx.TotalRowCount(), float64(realtimeRowCount)))
// If histNDV is zero - we have all NDV's in TopN - and no histograms.
// The histogram won't have a NotNullCount - so it needs to be derived.
notNullCount := idx.Histogram.NotNullCount()
if notNullCount <= 0 {
notNullCount = idx.TotalRowCount() - float64(idx.Histogram.NullCount)
}
// As a conservative estimate - take the smaller of the original totalRows or the additions.
// "realtimeRowCount - original count" is a better measure of inserts than modifyCount
totalRowCount := min(idx.TotalRowCount(), float64(realtimeRowCount)-idx.TotalRowCount())
return max(1, totalRowCount/histNDV)
increaseFactor := idx.GetIncreaseFactor(realtimeRowCount)
return outOfRangeFullNDV(float64(idx.Histogram.NDV), idx.TotalRowCount(), notNullCount, float64(realtimeRowCount), increaseFactor, modifyCount)
}
// return the average histogram rows (which excludes topN) and NDV that excluded topN
return idx.Histogram.NotNullCount() / histNDV
Expand Down
30 changes: 30 additions & 0 deletions pkg/planner/cardinality/selectivity.go
Original file line number Diff line number Diff line change
Expand Up @@ -1097,6 +1097,36 @@ func outOfRangeEQSelectivity(sctx planctx.PlanContext, ndv, realtimeRowCount, co
return selectivity
}

// outOfRangeFullNDV estimates the number of qualified rows when the topN represents all NDV values
// and the searched value does not appear in the topN.
//
// Parameters:
//   - ndv: the full number of distinct values recorded in the statistics; a value <= 0 means
//     "unknown" and an NDV is derived with a square-root heuristic instead.
//   - origRowCount: the row count at the time the statistics were collected.
//   - notNullCount: the non-NULL row count from the statistics; a value <= 0 means "unknown"
//     and it is replaced by min(origRowCount, realtimeRowCount).
//   - realtimeRowCount: the current row count of the table.
//   - increaseFactor: the factor by which the ndv is scaled (see the comment on the scaling below).
//   - modifyCount: the number of modifications since the statistics were collected.
//
// It returns 0 when modifyCount is 0, and otherwise an estimate that is never smaller than 1.
// The result is always a finite number: if no positive NDV can be established (for example
// because realtimeRowCount or increaseFactor is 0), the floor estimate of 1 is returned
// rather than the NaN/Inf that a division by zero would produce.
func outOfRangeFullNDV(ndv, origRowCount, notNullCount, realtimeRowCount, increaseFactor float64, modifyCount int64) (result float64) {
	// If the table hasn't been modified, it's safe to return 0.
	if modifyCount == 0 {
		return 0
	}
	// Calculate "newly added rows" using original row count. We do NOT use notNullCount here
	// because that can always be less than realtimeRowCount if NULLs exist.
	newRows := realtimeRowCount - origRowCount
	// If notNullCount is unknown (<= 0) - take the min of original row count and realtimeRowCount.
	if notNullCount <= 0 {
		notNullCount = min(origRowCount, realtimeRowCount)
	}
	// If realtimeRowCount has reduced below the original, we can't determine if there has been a
	// combination of inserts/updates/deletes or only deletes - any out of range estimate is unreliable.
	if newRows < 0 {
		newRows = min(notNullCount, realtimeRowCount)
	}
	if ndv <= 0 {
		// No NDV - derive an NDV using sqrt.
		ndv = math.Sqrt(max(notNullCount, realtimeRowCount))
	} else {
		// We need to increase the ndv by increaseFactor because the estimate will be increased by
		// the caller of the function.
		ndv *= increaseFactor
	}
	// Guard the division: a zero divisor yields NaN (0/0) or +Inf, and the builtin max
	// propagates NaN, so max(1, newRows/ndv) would not apply the floor in that case.
	if ndv <= 0 || math.IsNaN(ndv) {
		return 1
	}
	return max(1, newRows/ndv)
}

// crossValidationSelectivity gets the selectivity of multi-column equal conditions by cross validation.
func crossValidationSelectivity(
sctx planctx.PlanContext,
Expand Down
22 changes: 11 additions & 11 deletions pkg/planner/cardinality/selectivity_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -245,11 +245,11 @@ func TestEstimationForUnknownValues(t *testing.T) {

count, err = cardinality.GetRowCountByColumnRanges(sctx, &statsTbl.HistColl, colID, getRange(9, 30))
require.NoError(t, err)
require.Equal(t, 7.2, count)
require.Equal(t, 12.2, count)

count, err = cardinality.GetRowCountByColumnRanges(sctx, &statsTbl.HistColl, colID, getRange(9, math.MaxInt64))
require.NoError(t, err)
require.Equal(t, 7.2, count)
require.Equal(t, 12.2, count)

idxID := table.Meta().Indices[0].ID
count, err = cardinality.GetRowCountByIndexRanges(sctx, &statsTbl.HistColl, idxID, getRange(30, 30))
Expand All @@ -258,7 +258,7 @@ func TestEstimationForUnknownValues(t *testing.T) {

count, err = cardinality.GetRowCountByIndexRanges(sctx, &statsTbl.HistColl, idxID, getRange(9, 30))
require.NoError(t, err)
require.Equal(t, 7.0, count)
require.Equal(t, 10.0, count)

testKit.MustExec("truncate table t")
testKit.MustExec("insert into t values (null, null)")
Expand Down Expand Up @@ -332,13 +332,13 @@ func TestEstimationForUnknownValuesAfterModify(t *testing.T) {
testKit.MustExec("insert into t select a+10 from t where a <= 10")
require.Nil(t, h.DumpStatsDeltaToKV(true))
require.Nil(t, h.Update(context.Background(), dom.InfoSchema()))
statsTblnew := h.GetTableStats(table.Meta())
statsTblNew := h.GetTableStats(table.Meta())

// Search for a not found value based upon statistics - count should be >= 10 and <=40
count, err = cardinality.GetColumnRowCount(sctx, col, getRange(15, 15), statsTblnew.RealtimeCount, statsTblnew.ModifyCount, false)
// Search for a not found value based upon statistics - count should be > 20 and < 40
count, err = cardinality.GetColumnRowCount(sctx, col, getRange(15, 15), statsTblNew.RealtimeCount, statsTblNew.ModifyCount, false)
require.NoError(t, err)
require.Truef(t, count < 41, "expected: between 10 to 40, got: %v", count)
require.Truef(t, count > 9, "expected: between 10 to 40, got: %v", count)
require.Truef(t, count < 40, "expected: between 20 to 40, got: %v", count)
require.Truef(t, count > 20, "expected: between 20 to 40, got: %v", count)
}

func TestEstimationUniqueKeyEqualConds(t *testing.T) {
Expand Down Expand Up @@ -463,12 +463,12 @@ func TestSelectivity(t *testing.T) {
{
exprs: "a >= 1 and c > 1 and a < 2",
selectivity: 0.00617283950,
selectivityAfterIncrease: 0.00617283950,
selectivityAfterIncrease: 0.006378600823045267,
},
{
exprs: "a >= 1 and c >= 1 and a < 2",
selectivity: 0.01234567901,
selectivityAfterIncrease: 0.01234567901,
selectivityAfterIncrease: 0.012551440329218106,
},
{
exprs: "d = 0 and e = 1",
Expand All @@ -483,7 +483,7 @@ func TestSelectivity(t *testing.T) {
{
exprs: "a > 1 and b < 2 and c > 3 and d < 4 and e > 5",
selectivity: 5.870830440255832e-05,
selectivityAfterIncrease: 1.51329827770157e-05,
selectivityAfterIncrease: 0.005967078189300412,
},
{
exprs: longExpr,
Expand Down
Loading

0 comments on commit a3574aa

Please sign in to comment.