From 64bdc57d5eb8daa5554ac17c9ae009f4bd2607df Mon Sep 17 00:00:00 2001 From: Faizan Qazi Date: Tue, 7 Jan 2025 16:31:29 +0000 Subject: [PATCH] catalog/lease: handle transition to multi-region system database Previously, when converting the system database to multi-region we could run into transient errors during the process when counting leases. This would happen because the lease descriptor used by the transaction could be non-MR while the cached system database descriptor could be MR. This is expected since the two can be from different timestamps. To help address this, this patch will retry when InvalidFunction and InvalidParameterValue pgcodes with references to crdb_internal_region types are observed, which will happen if the lease table has not update the crdb_region columns type. The retry attempt will intentionally reference the region type as byte. Fixes: #138335 Release note: None --- pkg/sql/catalog/lease/count.go | 41 ++++++++++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 2 deletions(-) diff --git a/pkg/sql/catalog/lease/count.go b/pkg/sql/catalog/lease/count.go index a25826f26011..2d68b8f3fe7b 100644 --- a/pkg/sql/catalog/lease/count.go +++ b/pkg/sql/catalog/lease/count.go @@ -18,6 +18,8 @@ import ( "github.com/cockroachdb/cockroach/pkg/sql/catalog/systemschema" "github.com/cockroachdb/cockroach/pkg/sql/enum" "github.com/cockroachdb/cockroach/pkg/sql/isql" + "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode" + "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror" "github.com/cockroachdb/cockroach/pkg/sql/regionliveness" "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" "github.com/cockroachdb/cockroach/pkg/sql/sessiondata" @@ -58,6 +60,30 @@ func CountLeases( return detail.count, nil } +// isRegionColumnError detects if a InvalidParameterValue or +// UndefinedFunction are observed because of the region column. +// This can happen because of the following reasons: +// 1. The currently leased system database is not multi-region, but the leased +// system.lease table is multi-region. +// 2. The currently leased system database is multi-region, but the system.lease +// descriptor we have is not multi-region. +// +// Both cases are transient and are a side effect of using a cache system +// database descriptor for checks. +func isTransientRegionColumnError(err error) bool { + // No error detected nothing else needs to be checked. + if err == nil { + return false + } + // Some unrelated error was observed, so this is not linked to the + // region column transitioning from bytes to crdb_region. + if pgerror.GetPGCode(err) != pgcode.UndefinedFunction && + pgerror.GetPGCode(err) != pgcode.InvalidParameterValue { + return false + } + return strings.Contains(err.Error(), "crdb_internal_region") +} + func countLeasesWithDetail( ctx context.Context, db isql.DB, @@ -83,6 +109,8 @@ func countLeasesWithDetail( clusterversion.RemoveDevOffset(clusterversion.V24_1_SessionBasedLeasingUpgradeDescriptor.Version())) leasingMode := readSessionBasedLeasingMode(ctx, settings) whereClauses := make([][]string, 2) + forceMultiRegionQuery := false + useBytesOnRetry := false for _, t := range versions { versionClause := "" if !forAnyVersion { @@ -145,14 +173,23 @@ func countLeasesWithDetail( err := txn.WithSyntheticDescriptors(descsToInject, func() error { var err error - if cachedDatabaseRegions != nil && cachedDatabaseRegions.IsMultiRegion() { + if (cachedDatabaseRegions != nil && cachedDatabaseRegions.IsMultiRegion()) || + forceMultiRegionQuery { // If we are injecting a raw leases descriptors, that will not have the enum // type set, so convert the region to byte equivalent physical representation. detail, err = countLeasesByRegion(ctx, txn, prober, regionMap, cachedDatabaseRegions, - len(descsToInject) > 0, at, whereClause, usesOldSchema[i]) + len(descsToInject) > 0 || useBytesOnRetry, at, whereClause, usesOldSchema[i]) } else { detail, err = countLeasesNonMultiRegion(ctx, txn, at, whereClause, usesOldSchema[i]) } + // If any transient region column errors occur then we should retry the count query. + if isTransientRegionColumnError(err) { + forceMultiRegionQuery = true + // If the query was already multi-region aware, then the system database is MR, + // but our lease descriptor has not been upgraded yet. + useBytesOnRetry = cachedDatabaseRegions != nil && cachedDatabaseRegions.IsMultiRegion() + return txn.KV().GenerateForcedRetryableErr(ctx, "forcing retry once with MR columns") + } return err }) if err != nil {