From 8b80987208be4c658691c4f848b924f391b5eded Mon Sep 17 00:00:00 2001 From: Xuecheng Zhang Date: Thu, 31 Oct 2024 20:41:35 +0800 Subject: [PATCH 1/2] This is an automated cherry-pick of #5827 Signed-off-by: ti-chi-bot --- pkg/manager/member/pd_upgrader.go | 28 ++++++++++++++++++++++++++ pkg/manager/member/pd_upgrader_test.go | 24 ++++++++++++++++++++++ 2 files changed, 52 insertions(+) diff --git a/pkg/manager/member/pd_upgrader.go b/pkg/manager/member/pd_upgrader.go index 0c0a5410536..0c7561dc6b4 100644 --- a/pkg/manager/member/pd_upgrader.go +++ b/pkg/manager/member/pd_upgrader.go @@ -15,6 +15,7 @@ package member import ( "fmt" + "strconv" "github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1" "github.com/pingcap/tidb-operator/pkg/controller" @@ -23,6 +24,8 @@ import ( "github.com/pingcap/advanced-statefulset/client/apis/apps/v1/helper" apps "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/klog/v2" podutil "k8s.io/kubernetes/pkg/api/v1/pod" ) @@ -30,6 +33,10 @@ import ( const ( // set this PD clustre annotation to true to fail cluster upgrade if PD loose the quorum during one pod restart annoKeyPDPeersCheck = "tidb.pingcap.com/pd-check-quorum-before-upgrade" + + // TODO: change to use minReadySeconds in sts spec + // See https://kubernetes.io/blog/2021/08/27/minreadyseconds-statefulsets/ + annoKeyPDMinReadySeconds = "tidb.pingcap.com/pd-min-ready-seconds" ) type pdUpgrader struct { @@ -79,6 +86,17 @@ func (u *pdUpgrader) gracefulUpgrade(tc *v1alpha1.TidbCluster, oldSet *apps.Stat return nil } + minReadySeconds := 0 + s, ok := tc.Annotations[annoKeyPDMinReadySeconds] + if ok { + i, err := strconv.Atoi(s) + if err != nil { + klog.Warningf("tidbcluster: [%s/%s] annotation %s should be an integer: %v", ns, tcName, annoKeyPDMinReadySeconds, err) + } else { + minReadySeconds = i + } + } + mngerutils.SetUpgradePartition(newSet, *oldSet.Spec.UpdateStrategy.RollingUpdate.Partition) podOrdinals := helper.GetPodOrdinals(*oldSet.Spec.Replicas, oldSet).List() for _i := len(podOrdinals) - 1; _i >= 0; _i-- { @@ -95,8 +113,18 @@ func (u *pdUpgrader) gracefulUpgrade(tc *v1alpha1.TidbCluster, oldSet *apps.Stat } if revision == tc.Status.PD.StatefulSet.UpdateRevision { +<<<<<<< HEAD if !podutil.IsPodReady(pod) { return controller.RequeueErrorf("tidbcluster: [%s/%s]'s upgraded pd pod: [%s] is not ready", ns, tcName, podName) +======= + if !k8s.IsPodAvailable(pod, int32(minReadySeconds), metav1.Now()) { + readyCond := k8s.GetPodReadyCondition(pod.Status) + if readyCond == nil || readyCond.Status != corev1.ConditionTrue { + return controller.RequeueErrorf("tidbcluster: [%s/%s]'s upgraded pd pod: [%s] is not ready", ns, tcName, podName) + + } + return controller.RequeueErrorf("tidbcluster: [%s/%s]'s upgraded pd pod: [%s] is not available, last transition time is %v", ns, tcName, podName, readyCond.LastTransitionTime) +>>>>>>> cb809c895 (feat(minReadySeconds): support minReadySeconds for PD (#5827)) } if member, exist := tc.Status.PD.Members[PdName(tc.Name, i, tc.Namespace, tc.Spec.ClusterDomain, tc.Spec.AcrossK8s)]; !exist || !member.Health { return controller.RequeueErrorf("tidbcluster: [%s/%s]'s pd upgraded pod: [%s] is not health", ns, tcName, podName) diff --git a/pkg/manager/member/pd_upgrader_test.go b/pkg/manager/member/pd_upgrader_test.go index 5eb2008d0c5..45f6cc5d813 100644 --- a/pkg/manager/member/pd_upgrader_test.go +++ b/pkg/manager/member/pd_upgrader_test.go @@ -314,6 +314,30 @@ func TestPDUpgraderUpgrade(t *testing.T) { g.Expect(newSet.Spec.UpdateStrategy.RollingUpdate.Partition).To(Equal(pointer.Int32Ptr(1))) }, }, + { + name: "upgraded pod is ready but not available", + changeFn: func(tc *v1alpha1.TidbCluster) { + tc.Status.PD.Synced = true + if tc.Annotations == nil { + tc.Annotations = map[string]string{} + } + // 5min is enough for unit test + tc.Annotations[annoKeyTiDBMinReadySeconds] = "300" + }, + changePods: func(pods []*corev1.Pod) { + pods[1].Status.Conditions[0].LastTransitionTime = metav1.Now() + }, + changeOldSet: nil, + transferLeaderErr: false, + pdPeersAreUnstable: true, + errExpectFn: func(g *GomegaWithT, err error) { + g.Expect(err).NotTo(HaveOccurred()) + }, + expectFn: func(g *GomegaWithT, tc *v1alpha1.TidbCluster, newSet *apps.StatefulSet) { + g.Expect(tc.Status.PD.Phase).To(Equal(v1alpha1.UpgradePhase)) + g.Expect(newSet.Spec.UpdateStrategy.RollingUpdate.Partition).To(Equal(pointer.Int32Ptr(1))) + }, + }, } for i := range tests { From 2addf913b7f7ee4a048b996bbe92377b8116c85c Mon Sep 17 00:00:00 2001 From: csuzhangxc Date: Fri, 1 Nov 2024 01:57:48 +0000 Subject: [PATCH 2/2] resolve conflicts --- pkg/manager/member/pd_upgrader.go | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/pkg/manager/member/pd_upgrader.go b/pkg/manager/member/pd_upgrader.go index 0c7561dc6b4..c5336d3dd76 100644 --- a/pkg/manager/member/pd_upgrader.go +++ b/pkg/manager/member/pd_upgrader.go @@ -113,18 +113,13 @@ func (u *pdUpgrader) gracefulUpgrade(tc *v1alpha1.TidbCluster, oldSet *apps.Stat } if revision == tc.Status.PD.StatefulSet.UpdateRevision { -<<<<<<< HEAD - if !podutil.IsPodReady(pod) { - return controller.RequeueErrorf("tidbcluster: [%s/%s]'s upgraded pd pod: [%s] is not ready", ns, tcName, podName) -======= - if !k8s.IsPodAvailable(pod, int32(minReadySeconds), metav1.Now()) { - readyCond := k8s.GetPodReadyCondition(pod.Status) + if !podutil.IsPodAvailable(pod, int32(minReadySeconds), metav1.Now()) { + readyCond := podutil.GetPodReadyCondition(pod.Status) if readyCond == nil || readyCond.Status != corev1.ConditionTrue { return controller.RequeueErrorf("tidbcluster: [%s/%s]'s upgraded pd pod: [%s] is not ready", ns, tcName, podName) } return controller.RequeueErrorf("tidbcluster: [%s/%s]'s upgraded pd pod: [%s] is not available, last transition time is %v", ns, tcName, podName, readyCond.LastTransitionTime) ->>>>>>> cb809c895 (feat(minReadySeconds): support minReadySeconds for PD (#5827)) } if member, exist := tc.Status.PD.Members[PdName(tc.Name, i, tc.Namespace, tc.Spec.ClusterDomain, tc.Spec.AcrossK8s)]; !exist || !member.Health { return controller.RequeueErrorf("tidbcluster: [%s/%s]'s pd upgraded pod: [%s] is not health", ns, tcName, podName)