tidb graceful upgrade (#112)
* tidb graceful upgrade
xiaojingchen authored and weekface committed Oct 12, 2018
1 parent 6717b83 commit 2189d69
Showing 7 changed files with 255 additions and 43 deletions.
9 changes: 5 additions & 4 deletions pkg/apis/pingcap.com/v1alpha1/types.go
@@ -197,10 +197,11 @@ type PDFailureMember struct {
 
 // TiDBStatus is TiDB status
 type TiDBStatus struct {
-	Phase          MemberPhase                  `json:"phase,omitempty"`
-	StatefulSet    *apps.StatefulSetStatus      `json:"statefulSet,omitempty"`
-	Members        map[string]TiDBMember        `json:"members,omitempty"`
-	FailureMembers map[string]TiDBFailureMember `json:"failureMembers,omitempty"`
+	Phase                    MemberPhase                  `json:"phase,omitempty"`
+	StatefulSet              *apps.StatefulSetStatus      `json:"statefulSet,omitempty"`
+	Members                  map[string]TiDBMember        `json:"members,omitempty"`
+	FailureMembers           map[string]TiDBFailureMember `json:"failureMembers,omitempty"`
+	ResignDDLOwnerRetryCount int32                        `json:"resignDDLOwnerRetryCount,omitempty"`
 }
 
 // TiDBMember is TiDB member
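The new ResignDDLOwnerRetryCount field lives on the persisted TidbCluster status, so the retry bookkeeping survives controller restarts. A minimal sketch of how the field round-trips through the status JSON (illustrative only; the surrounding main is not repo code, and every other field is dropped by omitempty):

package main

import (
	"encoding/json"
	"fmt"

	"github.com/pingcap/tidb-operator/pkg/apis/pingcap.com/v1alpha1"
)

func main() {
	// Only the non-empty retry counter survives marshaling, matching the
	// `json:"resignDDLOwnerRetryCount,omitempty"` tag above.
	status := v1alpha1.TiDBStatus{ResignDDLOwnerRetryCount: 2}
	b, err := json.Marshal(status)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(b)) // {"resignDDLOwnerRetryCount":2}
}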
52 changes: 50 additions & 2 deletions pkg/controller/tidb_control.go
@@ -21,10 +21,17 @@ import (
 	"github.com/pingcap/tidb-operator/pkg/apis/pingcap.com/v1alpha1"
 )
 
+const (
+	// NotDDLOwnerError is the error message returned when the tidb node is not the ddl owner
+	NotDDLOwnerError = "This node is not a ddl owner, can't be resigned."
+)
+
 // TiDBControlInterface is the interface that knows how to manage tidb peers
 type TiDBControlInterface interface {
 	// GetHealth returns tidb's health info
 	GetHealth(tc *v1alpha1.TidbCluster) map[string]bool
+	// ResignDDLOwner resigns tidb's ddl ownership: it returns (true, nil) if the node is not the ddl owner, (false, nil) on a successful resignation, and (false, err) on failure
+	ResignDDLOwner(tc *v1alpha1.TidbCluster, ordinal int32) (bool, error)
 }
 
 // defaultTiDBControl is default implementation of TiDBControlInterface.
@@ -45,7 +52,7 @@ func (tdc *defaultTiDBControl) GetHealth(tc *v1alpha1.TidbCluster) map[string]bo
 	result := map[string]bool{}
 	for i := 0; i < int(tc.TiDBRealReplicas()); i++ {
 		hostName := fmt.Sprintf("%s-%d", TiDBMemberName(tcName), i)
-		url := fmt.Sprintf("http://%s.%s-tidb-peer.%s:10080/status", hostName, tcName, ns)
+		url := fmt.Sprintf("http://%s.%s.%s:10080/status", hostName, TiDBPeerMemberName(tcName), ns)
 		_, err := tdc.getBodyOK(url)
 		if err != nil {
 			result[hostName] = false
@@ -56,6 +63,31 @@ func (tdc *defaultTiDBControl) GetHealth(tc *v1alpha1.TidbCluster) map[string]bo
 	return result
 }
 
+func (tdc *defaultTiDBControl) ResignDDLOwner(tc *v1alpha1.TidbCluster, ordinal int32) (bool, error) {
+	tcName := tc.GetName()
+	ns := tc.GetNamespace()
+
+	hostName := fmt.Sprintf("%s-%d", TiDBMemberName(tcName), ordinal)
+	url := fmt.Sprintf("http://%s.%s.%s:10080/ddl/owner/resign", hostName, TiDBPeerMemberName(tcName), ns)
+	req, err := http.NewRequest("POST", url, nil)
+	if err != nil {
+		return false, err
+	}
+	res, err := tdc.httpClient.Do(req)
+	if err != nil {
+		return false, err
+	}
+	defer DeferClose(res.Body, &err)
+	if res.StatusCode == http.StatusOK {
+		return false, nil
+	}
+	err2 := readErrorBody(res.Body)
+	if err2.Error() == NotDDLOwnerError {
+		return true, nil
+	}
+	return false, err2
+}
+
 func (tdc *defaultTiDBControl) getBodyOK(apiURL string) ([]byte, error) {
 	res, err := tdc.httpClient.Get(apiURL)
 	if err != nil {
@@ -76,7 +108,9 @@ func (tdc *defaultTiDBControl) getBodyOK(apiURL string) ([]byte, error) {
 
 // FakeTiDBControl is a fake implementation of TiDBControlInterface.
 type FakeTiDBControl struct {
-	healthInfo map[string]bool
+	healthInfo          map[string]bool
+	resignDDLOwnerError error
+	notDDLOwner         bool
 }
 
 // NewFakeTiDBControl returns a FakeTiDBControl instance
@@ -89,6 +123,20 @@ func (ftd *FakeTiDBControl) SetHealth(healthInfo map[string]bool) {
 	ftd.healthInfo = healthInfo
 }
 
+// NotDDLOwner sets whether the fake tidb reports itself as not being the ddl owner
+func (ftd *FakeTiDBControl) NotDDLOwner(notDDLOwner bool) {
+	ftd.notDDLOwner = notDDLOwner
+}
+
+// SetResignDDLOwnerError sets the error returned by ResignDDLOwner for FakeTiDBControl
+func (ftd *FakeTiDBControl) SetResignDDLOwnerError(err error) {
+	ftd.resignDDLOwnerError = err
+}
+
 func (ftd *FakeTiDBControl) GetHealth(_ *v1alpha1.TidbCluster) map[string]bool {
 	return ftd.healthInfo
 }
+
+func (ftd *FakeTiDBControl) ResignDDLOwner(tc *v1alpha1.TidbCluster, ordinal int32) (bool, error) {
+	return ftd.notDDLOwner, ftd.resignDDLOwnerError
+}
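The resign call is a plain POST against the tidb status port. A standalone sketch of the same contract, with the pod FQDN wired up by hand and a raw-body comparison standing in for the operator's readErrorBody helper (both simplifying assumptions, not repo code):

package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
)

// resignDDLOwner mimics defaultTiDBControl.ResignDDLOwner above:
//   (true, nil)  -> this pod is not the ddl owner, nothing to resign
//   (false, nil) -> the ddl owner resigned successfully
//   (false, err) -> the resignation failed
func resignDDLOwner(podFQDN string) (bool, error) {
	url := fmt.Sprintf("http://%s:10080/ddl/owner/resign", podFQDN)
	res, err := http.Post(url, "", nil)
	if err != nil {
		return false, err
	}
	defer res.Body.Close()
	if res.StatusCode == http.StatusOK {
		return false, nil
	}
	body, _ := ioutil.ReadAll(res.Body)
	// The operator compares against NotDDLOwnerError after parsing the error
	// body; comparing the raw body directly is a simplification here.
	if string(body) == "This node is not a ddl owner, can't be resigned." {
		return true, nil
	}
	return false, fmt.Errorf("resign ddl owner failed: %s", body)
}

func main() {
	notOwner, err := resignDDLOwner("demo-tidb-1.demo-tidb-peer.default")
	fmt.Println(notOwner, err)
}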
6 changes: 3 additions & 3 deletions pkg/controller/tidbcluster/tidb_cluster_controller.go
@@ -100,11 +100,11 @@ func NewController(
 	pdScaler := mm.NewPDScaler(pdControl, pvcInformer.Lister(), pvcControl)
 	tikvScaler := mm.NewTiKVScaler(pdControl, pvcInformer.Lister(), pvcControl)
 	pdFailover := mm.NewPDFailover(cli, pdControl, pdFailoverPeriod, podInformer.Lister(), podControl, pvcInformer.Lister(), pvcControl, pvInformer.Lister())
-	pdUpgrader := mm.NewPDUpgrader(pdControl, podControl, podInformer.Lister())
 	tikvFailover := mm.NewTiKVFailover(pdControl)
-	tikvUpgrader := mm.NewTiKVUpgrader(pdControl, podControl, podInformer.Lister())
-	tidbUpgrader := mm.NewTiDBUpgrader()
 	tidbFailover := mm.NewTiDBFailover(tidbFailoverPeriod)
+	pdUpgrader := mm.NewPDUpgrader(pdControl, podControl, podInformer.Lister())
+	tikvUpgrader := mm.NewTiKVUpgrader(pdControl, podControl, podInformer.Lister())
+	tidbUpgrader := mm.NewTiDBUpgrader(tidbControl)
 
 	tcc := &Controller{
 		kubeClient: kubeCli,
7 changes: 5 additions & 2 deletions pkg/manager/member/tidb_member_manager.go
@@ -135,7 +135,7 @@ func (tmm *tidbMemberManager) syncTiDBStatefulSetForTidbCluster(tc *v1alpha1.Tid
 		return err
 	}
 
-	if !templateEqual(newTiDBSet.Spec.Template, oldTiDBSet.Spec.Template) {
+	if !templateEqual(newTiDBSet.Spec.Template, oldTiDBSet.Spec.Template) || tc.Status.TiDB.Phase == v1alpha1.UpgradePhase {
 		if err := tmm.tidbUpgrader.Upgrade(tc, oldTiDBSet, newTiDBSet); err != nil {
 			return err
 		}
@@ -155,6 +155,7 @@ func (tmm *tidbMemberManager) syncTiDBStatefulSetForTidbCluster(tc *v1alpha1.Tid
 		set := *oldTiDBSet
 		set.Spec.Template = newTiDBSet.Spec.Template
 		*set.Spec.Replicas = *newTiDBSet.Spec.Replicas
+		set.Spec.UpdateStrategy = newTiDBSet.Spec.UpdateStrategy
 		err := SetLastAppliedConfigAnnotation(&set)
 		if err != nil {
 			return err
@@ -322,7 +323,9 @@ func (tmm *tidbMemberManager) getNewTiDBSetForTidbCluster(tc *v1alpha1.TidbClust
 			},
 			ServiceName:         controller.TiDBPeerMemberName(tcName),
 			PodManagementPolicy: apps.ParallelPodManagement,
-			UpdateStrategy:      apps.StatefulSetUpdateStrategy{Type: apps.RollingUpdateStatefulSetStrategyType},
+			UpdateStrategy: apps.StatefulSetUpdateStrategy{Type: apps.RollingUpdateStatefulSetStrategyType,
+				RollingUpdate: &apps.RollingUpdateStatefulSetStrategy{Partition: func() *int32 { r := tc.TiDBRealReplicas(); return &r }()},
+			},
 		},
 	}
 	return tidbSet
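The RollingUpdate partition is what makes the rollout controllable: pods with an ordinal at or above the partition are recreated with the new template, pods below it keep the old one. Seeding it with tc.TiDBRealReplicas() means applying a new template initially upgrades nothing; the upgrader then lowers the partition one ordinal at a time. setUpgradePartition, referenced in the upgrader below, is an existing helper elsewhere in the repo; a plausible sketch of the behavior it needs (hypothetical reimplementation under that assumption, not the operator's actual code):

package member

import (
	apps "k8s.io/api/apps/v1beta1"
)

// setUpgradePartitionSketch pins the StatefulSet rolling update at the given
// ordinal: only pods with ordinal >= upgradeOrdinal are recreated with the
// new template, so the controller can advance the upgrade one pod at a time.
func setUpgradePartitionSketch(set *apps.StatefulSet, upgradeOrdinal int32) {
	set.Spec.UpdateStrategy = apps.StatefulSetUpdateStrategy{
		Type: apps.RollingUpdateStatefulSetStrategyType,
		RollingUpdate: &apps.RollingUpdateStatefulSetStrategy{
			Partition: &upgradeOrdinal,
		},
	}
}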
51 changes: 46 additions & 5 deletions pkg/manager/member/tidb_upgrader.go
@@ -15,26 +15,67 @@ package member
 
 import (
 	"github.com/pingcap/tidb-operator/pkg/apis/pingcap.com/v1alpha1"
+	"github.com/pingcap/tidb-operator/pkg/controller"
 	apps "k8s.io/api/apps/v1beta1"
 )
 
-type tidbUpgrader struct{}
+const (
+	// MaxResignDDLOwnerCount is the max resign DDL owner count
+	MaxResignDDLOwnerCount = 3
+)
+
+type tidbUpgrader struct {
+	tidbControl controller.TiDBControlInterface
+}
 
 // NewTiDBUpgrader returns a tidb Upgrader
-func NewTiDBUpgrader() Upgrader {
-	return &tidbUpgrader{}
+func NewTiDBUpgrader(tidbControl controller.TiDBControlInterface) Upgrader {
+	return &tidbUpgrader{tidbControl: tidbControl}
 }
 
 func (tdu *tidbUpgrader) Upgrade(tc *v1alpha1.TidbCluster, oldSet *apps.StatefulSet, newSet *apps.StatefulSet) error {
+	ns := tc.GetNamespace()
+	tcName := tc.GetName()
+
 	if tc.Status.PD.Phase == v1alpha1.UpgradePhase || tc.Status.TiKV.Phase == v1alpha1.UpgradePhase {
 		_, podSpec, err := GetLastAppliedConfig(oldSet)
 		if err != nil {
 			return err
 		}
 		newSet.Spec.Template.Spec = *podSpec
-	} else {
-		tc.Status.TiDB.Phase = v1alpha1.UpgradePhase
+		return nil
 	}
 
+	tc.Status.TiDB.Phase = v1alpha1.UpgradePhase
+	setUpgradePartition(newSet, *oldSet.Spec.UpdateStrategy.RollingUpdate.Partition)
+
+	if tc.Status.TiDB.StatefulSet.CurrentReplicas == 0 {
+		return controller.RequeueErrorf("tidbcluster: [%s/%s]'s tidb doesn't have old version pod to upgrade", ns, tcName)
+	}
+
+	if !tc.TiDBAllPodsStarted() {
+		return controller.RequeueErrorf("tidbcluster: [%s/%s]'s tidb pods are not all created", ns, tcName)
+	}
+
+	for i := tc.Status.TiDB.StatefulSet.Replicas; i > tc.Status.TiDB.StatefulSet.CurrentReplicas; i-- {
+		if member, exist := tc.Status.TiDB.Members[tidbPodName(tcName, i-1)]; !exist || !member.Health {
+			return controller.RequeueErrorf("tidbcluster: [%s/%s]'s tidb upgraded pods are not all ready", ns, tcName)
+		}
+	}
+
+	upgradeOrdinal := tc.Status.TiDB.StatefulSet.CurrentReplicas - 1
+	if tc.Spec.TiDB.Replicas > 1 {
+		if member, exist := tc.Status.TiDB.Members[tidbPodName(tcName, upgradeOrdinal)]; exist && member.Health {
+			hasResign, err := tdu.tidbControl.ResignDDLOwner(tc, upgradeOrdinal)
+			if (!hasResign || err != nil) && tc.Status.TiDB.ResignDDLOwnerRetryCount < MaxResignDDLOwnerCount {
+				tc.Status.TiDB.ResignDDLOwnerRetryCount++
+				return err
+			}
+		}
+	}
+
+	tc.Status.TiDB.ResignDDLOwnerRetryCount = 0
+	setUpgradePartition(newSet, upgradeOrdinal)
 	return nil
 }
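One subtlety in the resign gate above: even a successful resignation returns to the work queue once more (hasResign is false), giving DDL ownership a full sync period to move before the pod is deleted, while ResignDDLOwnerRetryCount caps how many rounds an unresponsive owner can stall the rollout. A runnable sketch of just that predicate (names are illustrative; the condition mirrors the diff):

package main

import "fmt"

const maxResignDDLOwnerCount = 3 // mirrors MaxResignDDLOwnerCount above

// shouldRequeue reproduces the gate in Upgrade: keep requeuing while the
// pod may still be the ddl owner, but stop once the retry budget is spent
// so the upgrade cannot stall forever.
func shouldRequeue(hasResign bool, err error, retryCount int32) bool {
	return (!hasResign || err != nil) && retryCount < maxResignDDLOwnerCount
}

func main() {
	fmt.Println(shouldRequeue(true, nil, 0))  // false: pod is not the owner, proceed
	fmt.Println(shouldRequeue(false, nil, 0)) // true: owner just resigned, wait one sync
	fmt.Println(shouldRequeue(false, nil, 3)) // false: retry budget spent, upgrade anyway
}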

(Diffs for the remaining 2 changed files are not shown in this view.)
