diff --git a/controllers/elfmachine_controller.go b/controllers/elfmachine_controller.go
index 4373b613..5dde8969 100644
--- a/controllers/elfmachine_controller.go
+++ b/controllers/elfmachine_controller.go
@@ -480,13 +480,17 @@ func (r *ElfMachineReconciler) reconcileVM(ctx *context.MachineContext) (*models
 		return nil, false, errors.New("bootstrapData is empty")
 	}
 
-	if ok := isElfClusterMemoryInsufficient(ctx.ElfCluster.Spec.Cluster); ok {
-		if canRetry := canRetryVMOperation(ctx.ElfCluster.Spec.Cluster); !canRetry {
-			ctx.Logger.V(1).Info(fmt.Sprintf("Insufficient memory for ELF cluster %s, skip creating VM", ctx.ElfCluster.Spec.Cluster))
+	if ok, message, err := isELFScheduleVMErrorRecorded(ctx); err != nil {
+		return nil, false, err
+	} else if ok {
+		if canRetry, err := canRetryVMOperation(ctx); err != nil {
+			return nil, false, err
+		} else if !canRetry {
+			ctx.Logger.V(1).Info(fmt.Sprintf("%s, skip creating VM", message))
 			return nil, false, nil
 		}
 
-		ctx.Logger.V(1).Info(fmt.Sprintf("Insufficient memory for ELF cluster %s, try to create VM", ctx.ElfCluster.Spec.Cluster))
+		ctx.Logger.V(1).Info(fmt.Sprintf("%s and the retry silence period has passed, will try to create the VM again", message))
 	}
 
 	// Only limit the virtual machines of the worker nodes
@@ -737,14 +741,18 @@ func (r *ElfMachineReconciler) powerOffVM(ctx *context.MachineContext) error {
 }
 
 func (r *ElfMachineReconciler) powerOnVM(ctx *context.MachineContext) error {
-	if ok := isElfClusterMemoryInsufficient(ctx.ElfCluster.Spec.Cluster); ok {
-		if canRetry := canRetryVMOperation(ctx.ElfCluster.Spec.Cluster); !canRetry {
-			ctx.Logger.V(1).Info(fmt.Sprintf("Insufficient memory for ELF cluster %s, skip powering on VM %s", ctx.ElfCluster.Spec.Cluster, ctx.ElfMachine.Status.VMRef))
+	if ok, message, err := isELFScheduleVMErrorRecorded(ctx); err != nil {
+		return err
+	} else if ok {
+		if canRetry, err := canRetryVMOperation(ctx); err != nil {
+			return err
+		} else if !canRetry {
+			ctx.Logger.V(1).Info(fmt.Sprintf("%s, skip powering on VM %s", message, ctx.ElfMachine.Status.VMRef))
 			return nil
 		}
 
-		ctx.Logger.V(1).Info(fmt.Sprintf("Insufficient memory for the ELF cluster %s was detected previously, try to power on VM %s to check if the ELF cluster has sufficient memory now", ctx.ElfCluster.Spec.Cluster, ctx.ElfMachine.Status.VMRef))
+		ctx.Logger.V(1).Info(fmt.Sprintf("%s and the retry silence period has passed, will try to power on the VM again", message))
 	}
 
 	if ok := acquireTicketForUpdatingVM(ctx.ElfMachine.Name); !ok {
@@ -843,8 +851,16 @@ func (r *ElfMachineReconciler) reconcileVMTask(ctx *context.MachineContext, vm *
 		case service.IsCloneVMTask(task):
 			releaseTicketForCreateVM(ctx.ElfMachine.Name)
 		case service.IsMemoryInsufficientError(errorMessage):
-			setElfClusterMemoryInsufficient(ctx.ElfCluster.Spec.Cluster, true)
-			message := fmt.Sprintf("Insufficient memory detected for ELF cluster %s", ctx.ElfCluster.Spec.Cluster)
+			recordElfClusterMemoryInsufficient(ctx, true)
+			message := fmt.Sprintf("Insufficient memory detected for the ELF cluster %s", ctx.ElfCluster.Spec.Cluster)
+			ctx.Logger.Info(message)
+
+			return true, errors.New(message)
+		case service.IsPlacementGroupError(errorMessage):
+			if err := recordPlacementGroupPolicyNotSatisfied(ctx, true); err != nil {
+				return true, err
+			}
+			message := "The placement group policy cannot be satisfied"
 			ctx.Logger.Info(message)
 
 			return true, errors.New(message)
@@ -853,8 +869,11 @@ func (r *ElfMachineReconciler) reconcileVMTask(ctx *context.MachineContext, vm *
 		ctx.Logger.Info("VM task succeeded", "vmRef", vmRef, "taskRef", taskRef, "taskDescription", service.GetTowerString(task.Description))
succeeded", "vmRef", vmRef, "taskRef", taskRef, "taskDescription", service.GetTowerString(task.Description)) if service.IsCloneVMTask(task) || service.IsPowerOnVMTask(task) { - setElfClusterMemoryInsufficient(ctx.ElfCluster.Spec.Cluster, false) releaseTicketForCreateVM(ctx.ElfMachine.Name) + recordElfClusterMemoryInsufficient(ctx, false) + if err := recordPlacementGroupPolicyNotSatisfied(ctx, false); err != nil { + return true, err + } } default: ctx.Logger.Info("Waiting for VM task done", "vmRef", vmRef, "taskRef", taskRef, "taskStatus", service.GetTowerTaskStatus(task.Status), "taskDescription", service.GetTowerString(task.Description)) diff --git a/controllers/elfmachine_controller_test.go b/controllers/elfmachine_controller_test.go index fbd84a0f..543eca47 100644 --- a/controllers/elfmachine_controller_test.go +++ b/controllers/elfmachine_controller_test.go @@ -267,27 +267,44 @@ var _ = Describe("ElfMachineReconciler", func() { }) It("should create a new VM if none exists", func() { + resetClusterResourceMap() vm := fake.NewTowerVM() vm.Name = &elfMachine.Name + elfCluster.Spec.Cluster = clusterKey task := fake.NewTowerTask() withTaskVM := fake.NewWithTaskVM(vm, task) ctrlContext := newCtrlContexts(elfCluster, cluster, elfMachine, machine, secret, md) fake.InitOwnerReferences(ctrlContext, elfCluster, cluster, elfMachine, machine) + machineContext := newMachineContext(ctrlContext, elfCluster, cluster, elfMachine, machine, mockVMService) + machineContext.VMService = mockVMService + recordIsUnmet(machineContext, clusterKey, true) + reconciler := &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService} + elfMachineKey := capiutil.ObjectKey(elfMachine) + result, err := reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: elfMachineKey}) + Expect(result.RequeueAfter).NotTo(BeZero()) + Expect(err).NotTo(HaveOccurred()) + Expect(logBuffer.String()).To(ContainSubstring("Insufficient memory detected for the ELF cluster")) + + logBuffer = new(bytes.Buffer) + klog.SetOutput(logBuffer) mockVMService.EXPECT().Clone(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(withTaskVM, nil) mockVMService.EXPECT().Get(*vm.ID).Return(vm, nil) mockVMService.EXPECT().GetTask(*task.ID).Return(task, nil) + mockVMService.EXPECT().GetVMPlacementGroup(gomock.Any()).Return(placementGroup, nil) + expireELFScheduleVMError(machineContext, clusterKey) - reconciler := &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService} - elfMachineKey := capiutil.ObjectKey(elfMachine) - result, err := reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: elfMachineKey}) + reconciler = &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService} + result, err = reconciler.Reconcile(ctx, ctrl.Request{NamespacedName: elfMachineKey}) Expect(result.RequeueAfter).NotTo(BeZero()) Expect(err).ShouldNot(HaveOccurred()) + Expect(logBuffer.String()).To(ContainSubstring("and the retry silence period passes, will try to create the VM again")) Expect(logBuffer.String()).To(ContainSubstring("Waiting for VM task done")) elfMachine = &infrav1.ElfMachine{} Expect(reconciler.Client.Get(reconciler, elfMachineKey, elfMachine)).To(Succeed()) Expect(elfMachine.Status.VMRef).To(Equal(*vm.ID)) Expect(elfMachine.Status.TaskRef).To(Equal(*task.ID)) + resetClusterResourceMap() }) It("should recover from lost task", func() { @@ -789,6 +806,33 @@ var _ = Describe("ElfMachineReconciler", func() { Expect(err).NotTo(HaveOccurred()) 
 			expectConditions(elfMachine, []conditionAssertion{{infrav1.VMProvisionedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.PowerOffReason}})
 		})
+
+		Context("powerOnVM", func() {
+			It("should skip power on when a scheduling error is recorded, then retry after the silence period", func() {
+				resetClusterResourceMap()
+				vm := fake.NewTowerVM()
+				elfMachine.Status.VMRef = *vm.LocalID
+				elfCluster.Spec.Cluster = clusterKey
+				ctrlContext := newCtrlContexts(elfCluster, cluster, elfMachine, machine, secret, md)
+				fake.InitOwnerReferences(ctrlContext, elfCluster, cluster, elfMachine, machine)
+				machineContext := newMachineContext(ctrlContext, elfCluster, cluster, elfMachine, machine, mockVMService)
+				machineContext.VMService = mockVMService
+				recordIsUnmet(machineContext, clusterKey, true)
+				reconciler := &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService}
+				err := reconciler.powerOnVM(machineContext)
+				Expect(err).NotTo(HaveOccurred())
+				Expect(logBuffer.String()).To(ContainSubstring("Insufficient memory detected for the ELF cluster"))
+
+				task := fake.NewTowerTask()
+				mockVMService.EXPECT().PowerOn(elfMachine.Status.VMRef).Return(task, nil)
+				expireELFScheduleVMError(machineContext, clusterKey)
+				err = reconciler.powerOnVM(machineContext)
+				Expect(err).NotTo(HaveOccurred())
+				Expect(logBuffer.String()).To(ContainSubstring("and the retry silence period has passed, will try to power on the VM again"))
+				expectConditions(elfMachine, []conditionAssertion{{infrav1.VMProvisionedCondition, corev1.ConditionFalse, clusterv1.ConditionSeverityInfo, infrav1.PoweringOnReason}})
+				resetClusterResourceMap()
+			})
+		})
 	})
 
 	Context("Reconcile Join Placement Group", func() {
@@ -2646,6 +2690,53 @@ var _ = Describe("ElfMachineReconciler", func() {
 			Expect(strings.Contains(err.Error(), "failed to get task")).To(BeTrue())
 			Expect(elfMachine.Status.TaskRef).To(Equal(*task.ID))
 		})
+
+		It("should handle failed/succeeded task", func() {
+			resetClusterResourceMap()
+			task := fake.NewTowerTask()
+			task.Status = models.NewTaskStatus(models.TaskStatusFAILED)
+			task.ErrorMessage = service.TowerString(service.MemoryInsufficientError)
+			elfMachine.Status.TaskRef = *task.ID
+			ctrlContext := newCtrlContexts(elfCluster, cluster, elfMachine, machine, secret, md)
+			fake.InitOwnerReferences(ctrlContext, elfCluster, cluster, elfMachine, machine)
+			machineContext := newMachineContext(ctrlContext, elfCluster, cluster, elfMachine, machine, mockVMService)
+			machineContext.VMService = mockVMService
+			mockVMService.EXPECT().GetTask(elfMachine.Status.TaskRef).AnyTimes().Return(task, nil)
+
+			reconciler := &ElfMachineReconciler{ControllerContext: ctrlContext, NewVMService: mockNewVMService}
+			ok, err := reconciler.reconcileVMTask(machineContext, nil)
+			Expect(ok).Should(BeTrue())
+			Expect(err.Error()).To(ContainSubstring("Insufficient memory detected for the ELF cluster"))
+			Expect(elfMachine.Status.TaskRef).To(Equal(""))
+			Expect(logBuffer.String()).To(ContainSubstring("VM task failed"))
+
+			logBuffer = new(bytes.Buffer)
+			klog.SetOutput(logBuffer)
+			task.ErrorMessage = service.TowerString(service.PlacementGroupMustError)
+			elfMachine.Status.TaskRef = *task.ID
+			ok, err = reconciler.reconcileVMTask(machineContext, nil)
+			Expect(ok).Should(BeTrue())
+			Expect(err.Error()).To(ContainSubstring("The placement group policy cannot be satisfied"))
+			Expect(logBuffer.String()).To(ContainSubstring("VM task failed"))
+
+			ok, msg, err := isELFScheduleVMErrorRecorded(machineContext)
+			Expect(ok).To(BeTrue())
+			Expect(msg).To(ContainSubstring("Insufficient memory detected for the ELF cluster"))
cluster")) + Expect(err).ShouldNot(HaveOccurred()) + + task.Status = models.NewTaskStatus(models.TaskStatusSUCCESSED) + task.Description = service.TowerString("Start VM") + elfMachine.Status.TaskRef = *task.ID + ok, err = reconciler.reconcileVMTask(machineContext, nil) + Expect(ok).Should(BeTrue()) + Expect(err).ShouldNot(HaveOccurred()) + Expect(logBuffer.String()).To(ContainSubstring("VM task succeeded")) + + ok, msg, err = isELFScheduleVMErrorRecorded(machineContext) + Expect(ok).To(BeFalse()) + Expect(msg).To(Equal("")) + Expect(err).ShouldNot(HaveOccurred()) + }) }) Context("Reconcile Node", func() { diff --git a/controllers/tower_cache.go b/controllers/tower_cache.go index d8ef9fc6..6a319f22 100644 --- a/controllers/tower_cache.go +++ b/controllers/tower_cache.go @@ -17,78 +17,119 @@ limitations under the License. package controllers import ( + "fmt" "sync" "time" + + "github.com/smartxworks/cluster-api-provider-elf/pkg/context" + towerresources "github.com/smartxworks/cluster-api-provider-elf/pkg/resources" ) const ( silenceTime = time.Minute * 5 ) -var clusterStatusMap = make(map[string]*clusterStatus) +var clusterResourceMap = make(map[string]*clusterResource) var lock sync.RWMutex -type clusterStatus struct { - Resources resources -} - -type resources struct { - IsMemoryInsufficient bool - // LastDetected records the last memory detection time +type clusterResource struct { + // IsUnmet indicates whether the resource does not meet the requirement. + // For example, true can indicate insufficient memory and not satisfy placement group policy. + IsUnmet bool + // LastDetected records the last resource detection time LastDetected time.Time - // LastRetried records the time of the last attempt to detect memory + // LastRetried records the time of the last attempt to detect resource LastRetried time.Time } -// isElfClusterMemoryInsufficient returns whether the ELF cluster has insufficient memory. -func isElfClusterMemoryInsufficient(clusterID string) bool { +// isELFScheduleVMErrorRecorded returns whether the ELF cluster has failed scheduling virtual machine errors. +// +// Includes these scenarios: +// 1. ELF cluster has insufficient memory. +// 2. Placement group not satisfy policy. +func isELFScheduleVMErrorRecorded(ctx *context.MachineContext) (bool, string, error) { lock.RLock() defer lock.RUnlock() - if status, ok := clusterStatusMap[clusterID]; ok { - return status.Resources.IsMemoryInsufficient + if resource, ok := clusterResourceMap[getMemoryKey(ctx.ElfCluster.Spec.Cluster)]; ok && resource.IsUnmet { + return true, fmt.Sprintf("Insufficient memory detected for the ELF cluster %s", ctx.ElfCluster.Spec.Cluster), nil } - return false + placementGroupName, err := towerresources.GetVMPlacementGroupName(ctx, ctx.Client, ctx.Machine, ctx.Cluster) + if err != nil { + return false, "", err + } + + if resource, ok := clusterResourceMap[getPlacementGroupKey(placementGroupName)]; ok && resource.IsUnmet { + return true, fmt.Sprintf("Not satisfy policy detected for the placement group %s", placementGroupName), nil + } + + return false, "", nil } -// setElfClusterMemoryInsufficient sets whether the memory is insufficient. -func setElfClusterMemoryInsufficient(clusterID string, isInsufficient bool) { +// recordElfClusterMemoryInsufficient records whether the memory is insufficient. 
+func recordElfClusterMemoryInsufficient(ctx *context.MachineContext, isInsufficient bool) {
 	lock.Lock()
 	defer lock.Unlock()
 
-	now := time.Now()
-	resources := resources{
-		IsMemoryInsufficient: isInsufficient,
-		LastDetected:         now,
-		LastRetried:          now,
+	clusterResourceMap[getMemoryKey(ctx.ElfCluster.Spec.Cluster)] = newClusterResource(isInsufficient)
+}
+
+// recordPlacementGroupPolicyNotSatisfied records whether the placement group policy is not satisfied.
+func recordPlacementGroupPolicyNotSatisfied(ctx *context.MachineContext, isNotSatisfiedPolicy bool) error {
+	lock.Lock()
+	defer lock.Unlock()
+
+	placementGroupName, err := towerresources.GetVMPlacementGroupName(ctx, ctx.Client, ctx.Machine, ctx.Cluster)
+	if err != nil {
+		return err
 	}
 
-	if status, ok := clusterStatusMap[clusterID]; ok {
-		status.Resources = resources
-	} else {
-		clusterStatusMap[clusterID] = &clusterStatus{Resources: resources}
+	clusterResourceMap[getPlacementGroupKey(placementGroupName)] = newClusterResource(isNotSatisfiedPolicy)
+
+	return nil
+}
+
+func newClusterResource(isUnmet bool) *clusterResource {
+	now := time.Now()
+	return &clusterResource{
+		IsUnmet:      isUnmet,
+		LastDetected: now,
+		LastRetried:  now,
 	}
 }
 
 // canRetryVMOperation returns whether virtual machine operations(Create/PowerOn)
 // can be performed.
-func canRetryVMOperation(clusterID string) bool {
+func canRetryVMOperation(ctx *context.MachineContext) (bool, error) {
 	lock.Lock()
 	defer lock.Unlock()
 
-	if status, ok := clusterStatusMap[clusterID]; ok {
-		if !status.Resources.IsMemoryInsufficient {
+	if ok := canRetry(getMemoryKey(ctx.ElfCluster.Spec.Cluster)); ok {
+		return true, nil
+	}
+
+	placementGroupName, err := towerresources.GetVMPlacementGroupName(ctx, ctx.Client, ctx.Machine, ctx.Cluster)
+	if err != nil {
+		return false, err
+	}
+
+	return canRetry(getPlacementGroupKey(placementGroupName)), nil
+}
+
+func canRetry(key string) bool {
+	if resource, ok := clusterResourceMap[key]; ok {
+		if !resource.IsUnmet {
 			return false
 		}
 
-		if time.Now().Before(status.Resources.LastDetected.Add(silenceTime)) {
+		if time.Now().Before(resource.LastDetected.Add(silenceTime)) {
 			return false
 		} else {
-			if time.Now().Before(status.Resources.LastRetried.Add(silenceTime)) {
+			if time.Now().Before(resource.LastRetried.Add(silenceTime)) {
 				return false
 			} else {
-				status.Resources.LastRetried = time.Now()
+				resource.LastRetried = time.Now()
 				return true
 			}
 		}
@@ -96,3 +137,11 @@ func canRetryVMOperation(clusterID string) bool {
 
 	return false
 }
+
+func getMemoryKey(clusterID string) string {
+	return fmt.Sprintf("%s-memory", clusterID)
+}
+
+func getPlacementGroupKey(placementGroup string) string {
+	return fmt.Sprintf("%s-placement-group", placementGroup)
+}
diff --git a/controllers/tower_cache_test.go b/controllers/tower_cache_test.go
index f3a52cb5..cb6e7a74 100644
--- a/controllers/tower_cache_test.go
+++ b/controllers/tower_cache_test.go
@@ -17,86 +17,169 @@ limitations under the License.
 package controllers
 
 import (
+	"strings"
+
 	. "github.com/onsi/ginkgo/v2"
"github.com/onsi/gomega" + "github.com/smartxworks/cluster-api-provider-elf/pkg/context" + towerresources "github.com/smartxworks/cluster-api-provider-elf/pkg/resources" "github.com/smartxworks/cluster-api-provider-elf/test/fake" ) -var _ = Describe("TowerCache", func() { - var clusterID string +const ( + clusterKey = "clusterID" + placementGroupKey = "getPlacementGroupName" +) +var _ = Describe("TowerCache", func() { BeforeEach(func() { - clusterID = fake.UUID() - resetClusterStatusMap() + resetClusterResourceMap() }) - It("should set memoryInsufficient", func() { - Expect(clusterStatusMap).NotTo(HaveKey(clusterID)) - Expect(clusterStatusMap[clusterID]).To(BeNil()) - - setElfClusterMemoryInsufficient(clusterID, true) - Expect(clusterStatusMap[clusterID].Resources.IsMemoryInsufficient).To(BeTrue()) - Expect(clusterStatusMap[clusterID].Resources.LastDetected).To(Equal(clusterStatusMap[clusterID].Resources.LastRetried)) - - setElfClusterMemoryInsufficient(clusterID, true) - Expect(clusterStatusMap[clusterID].Resources.IsMemoryInsufficient).To(BeTrue()) - Expect(clusterStatusMap[clusterID].Resources.LastDetected).To(Equal(clusterStatusMap[clusterID].Resources.LastRetried)) - - setElfClusterMemoryInsufficient(clusterID, false) - Expect(clusterStatusMap[clusterID].Resources.IsMemoryInsufficient).To(BeFalse()) - Expect(clusterStatusMap[clusterID].Resources.LastDetected).To(Equal(clusterStatusMap[clusterID].Resources.LastRetried)) - - resetClusterStatusMap() - Expect(clusterStatusMap).NotTo(HaveKey(clusterID)) - Expect(clusterStatusMap[clusterID]).To(BeNil()) - - setElfClusterMemoryInsufficient(clusterID, false) - Expect(clusterStatusMap[clusterID].Resources.IsMemoryInsufficient).To(BeFalse()) - Expect(clusterStatusMap[clusterID].Resources.LastDetected).To(Equal(clusterStatusMap[clusterID].Resources.LastRetried)) - - setElfClusterMemoryInsufficient(clusterID, false) - Expect(clusterStatusMap[clusterID].Resources.IsMemoryInsufficient).To(BeFalse()) - Expect(clusterStatusMap[clusterID].Resources.LastDetected).To(Equal(clusterStatusMap[clusterID].Resources.LastRetried)) - - setElfClusterMemoryInsufficient(clusterID, true) - Expect(clusterStatusMap[clusterID].Resources.IsMemoryInsufficient).To(BeTrue()) - Expect(clusterStatusMap[clusterID].Resources.LastDetected).To(Equal(clusterStatusMap[clusterID].Resources.LastRetried)) + It("should set memoryInsufficient/policyNotSatisfied", func() { + for _, name := range []string{clusterKey, placementGroupKey} { + resetClusterResourceMap() + elfCluster, cluster, elfMachine, machine, secret := fake.NewClusterAndMachineObjects() + elfCluster.Spec.Cluster = name + md := fake.NewMD() + md.Name = name + fake.ToWorkerMachine(machine, md) + fake.ToWorkerMachine(elfMachine, md) + ctrlContext := newCtrlContexts(elfCluster, cluster, elfMachine, machine, secret, md) + machineContext := newMachineContext(ctrlContext, elfCluster, cluster, elfMachine, machine, nil) + key := getKey(machineContext, name) + + Expect(clusterResourceMap).NotTo(HaveKey(key)) + Expect(clusterResourceMap[key]).To(BeNil()) + + recordIsUnmet(machineContext, name, true) + Expect(clusterResourceMap[key].IsUnmet).To(BeTrue()) + Expect(clusterResourceMap[key].LastDetected).To(Equal(clusterResourceMap[key].LastRetried)) + + recordIsUnmet(machineContext, name, true) + Expect(clusterResourceMap[key].IsUnmet).To(BeTrue()) + Expect(clusterResourceMap[key].LastDetected).To(Equal(clusterResourceMap[key].LastRetried)) + + recordIsUnmet(machineContext, name, false) + 
+			Expect(clusterResourceMap[key].IsUnmet).To(BeFalse())
+			Expect(clusterResourceMap[key].LastDetected).To(Equal(clusterResourceMap[key].LastRetried))
+
+			resetClusterResourceMap()
+			Expect(clusterResourceMap).NotTo(HaveKey(key))
+			Expect(clusterResourceMap[key]).To(BeNil())
+
+			recordIsUnmet(machineContext, name, false)
+			Expect(clusterResourceMap[key].IsUnmet).To(BeFalse())
+			Expect(clusterResourceMap[key].LastDetected).To(Equal(clusterResourceMap[key].LastRetried))
+
+			recordIsUnmet(machineContext, name, false)
+			Expect(clusterResourceMap[key].IsUnmet).To(BeFalse())
+			Expect(clusterResourceMap[key].LastDetected).To(Equal(clusterResourceMap[key].LastRetried))
+
+			recordIsUnmet(machineContext, name, true)
+			Expect(clusterResourceMap[key].IsUnmet).To(BeTrue())
+			Expect(clusterResourceMap[key].LastDetected).To(Equal(clusterResourceMap[key].LastRetried))
+		}
 	})
 
-	It("should return whether memory is insufficient", func() {
-		Expect(clusterStatusMap).NotTo(HaveKey(clusterID))
-		Expect(clusterStatusMap[clusterID]).To(BeNil())
-
-		Expect(isElfClusterMemoryInsufficient(clusterID)).To(BeFalse())
-
-		setElfClusterMemoryInsufficient(clusterID, false)
-		Expect(isElfClusterMemoryInsufficient(clusterID)).To(BeFalse())
+	It("should return whether the VM operation can be retried", func() {
+		for _, name := range []string{clusterKey, placementGroupKey} {
+			resetClusterResourceMap()
+			elfCluster, cluster, elfMachine, machine, secret := fake.NewClusterAndMachineObjects()
+			elfCluster.Spec.Cluster = name
+			md := fake.NewMD()
+			md.Name = name
+			fake.ToWorkerMachine(machine, md)
+			fake.ToWorkerMachine(elfMachine, md)
+			ctrlContext := newCtrlContexts(elfCluster, cluster, elfMachine, machine, secret, md)
+			machineContext := newMachineContext(ctrlContext, elfCluster, cluster, elfMachine, machine, nil)
+			key := getKey(machineContext, name)
+
+			Expect(clusterResourceMap).NotTo(HaveKey(key))
+			Expect(clusterResourceMap[key]).To(BeNil())
+			ok, err := canRetryVMOperation(machineContext)
+			Expect(ok).To(BeFalse())
+			Expect(err).ShouldNot(HaveOccurred())
+
+			recordIsUnmet(machineContext, name, false)
+			ok, err = canRetryVMOperation(machineContext)
+			Expect(ok).To(BeFalse())
+			Expect(err).ShouldNot(HaveOccurred())
+
+			recordIsUnmet(machineContext, name, true)
+			ok, err = canRetryVMOperation(machineContext)
+			Expect(ok).To(BeFalse())
+			Expect(err).ShouldNot(HaveOccurred())
+
+			expireELFScheduleVMError(machineContext, name)
+			ok, err = canRetryVMOperation(machineContext)
+			Expect(ok).To(BeTrue())
+			Expect(err).ShouldNot(HaveOccurred())
+
+			ok, err = canRetryVMOperation(machineContext)
+			Expect(ok).To(BeFalse())
+			Expect(err).ShouldNot(HaveOccurred())
+		}
+	})
 
-		setElfClusterMemoryInsufficient(clusterID, true)
-		Expect(isElfClusterMemoryInsufficient(clusterID)).To(BeTrue())
+	It("isELFScheduleVMErrorRecorded", func() {
+		resetClusterResourceMap()
+		elfCluster, cluster, elfMachine, machine, secret := fake.NewClusterAndMachineObjects()
+		elfCluster.Spec.Cluster = clusterKey
+		md := fake.NewMD()
+		md.Name = placementGroupKey
+		fake.ToWorkerMachine(machine, md)
+		fake.ToWorkerMachine(elfMachine, md)
+		ctrlContext := newCtrlContexts(elfCluster, cluster, elfMachine, machine, secret, md)
+		machineContext := newMachineContext(ctrlContext, elfCluster, cluster, elfMachine, machine, nil)
+
+		ok, msg, err := isELFScheduleVMErrorRecorded(machineContext)
+		Expect(ok).To(BeFalse())
+		Expect(msg).To(Equal(""))
+		Expect(err).ShouldNot(HaveOccurred())
+
+		recordIsUnmet(machineContext, clusterKey, true)
+		ok, msg, err = isELFScheduleVMErrorRecorded(machineContext)
+		Expect(ok).To(BeTrue())
+		Expect(msg).To(ContainSubstring("Insufficient memory detected for the ELF cluster"))
+		Expect(err).ShouldNot(HaveOccurred())
+
+		resetClusterResourceMap()
+		recordIsUnmet(machineContext, placementGroupKey, true)
+		ok, msg, err = isELFScheduleVMErrorRecorded(machineContext)
+		Expect(ok).To(BeTrue())
+		Expect(msg).To(ContainSubstring("Unsatisfied policy detected for the placement group"))
+		Expect(err).ShouldNot(HaveOccurred())
 	})
 
-	It("should return whether need to detect", func() {
-		Expect(clusterStatusMap).NotTo(HaveKey(clusterID))
-		Expect(clusterStatusMap[clusterID]).To(BeNil())
+})
 
-		Expect(canRetryVMOperation(clusterID)).To(BeFalse())
+func getKey(ctx *context.MachineContext, name string) string {
+	if name == clusterKey {
+		return getMemoryKey(name)
+	}
 
-		setElfClusterMemoryInsufficient(clusterID, false)
-		Expect(canRetryVMOperation(clusterID)).To(BeFalse())
+	placementGroupName, err := towerresources.GetVMPlacementGroupName(ctx, ctx.Client, ctx.Machine, ctx.Cluster)
+	Expect(err).ShouldNot(HaveOccurred())
 
-		setElfClusterMemoryInsufficient(clusterID, true)
-		Expect(canRetryVMOperation(clusterID)).To(BeFalse())
+	return getPlacementGroupKey(placementGroupName)
+}
 
-		clusterStatusMap[clusterID].Resources.LastDetected = clusterStatusMap[clusterID].Resources.LastDetected.Add(-silenceTime)
-		clusterStatusMap[clusterID].Resources.LastRetried = clusterStatusMap[clusterID].Resources.LastRetried.Add(-silenceTime)
-		Expect(canRetryVMOperation(clusterID)).To(BeTrue())
+func recordIsUnmet(ctx *context.MachineContext, key string, isUnmet bool) {
+	if strings.Contains(key, clusterKey) {
+		recordElfClusterMemoryInsufficient(ctx, isUnmet)
+		return
+	}
 
-		Expect(canRetryVMOperation(clusterID)).To(BeFalse())
-	})
-})
+	Expect(recordPlacementGroupPolicyNotSatisfied(ctx, isUnmet)).ShouldNot(HaveOccurred())
+}
+
+func expireELFScheduleVMError(ctx *context.MachineContext, name string) {
+	key := getKey(ctx, name)
+	clusterResourceMap[key].LastDetected = clusterResourceMap[key].LastDetected.Add(-silenceTime)
+	clusterResourceMap[key].LastRetried = clusterResourceMap[key].LastRetried.Add(-silenceTime)
+}
 
-func resetClusterStatusMap() {
-	clusterStatusMap = make(map[string]*clusterStatus)
+func resetClusterResourceMap() {
+	clusterResourceMap = make(map[string]*clusterResource)
 }
diff --git a/pkg/service/errors.go b/pkg/service/errors.go
index 4114ecaf..b55122ae 100644
--- a/pkg/service/errors.go
+++ b/pkg/service/errors.go
@@ -36,6 +36,9 @@ const (
 	LabelAddFailed          = "LABEL_ADD_FAILED"
 	CloudInitError          = "VM_CLOUD_INIT_CONFIG_ERROR"
 	MemoryInsufficientError = "HostAvailableMemoryFilter"
+	PlacementGroupError      = "PlacementGroupFilter" // SMTX OS <= 5.0.4
+	PlacementGroupMustError  = "PlacementGroupMustFilter"
+	PlacementGroupPriorError = "PlacementGroupPriorFilter"
 )
 
 func IsVMNotFound(err error) bool {
@@ -86,3 +89,17 @@ func FormatCloudInitError(message string) string {
 func IsMemoryInsufficientError(message string) bool {
 	return strings.Contains(message, MemoryInsufficientError)
 }
+
+func IsPlacementGroupError(message string) bool {
+	return strings.Contains(message, PlacementGroupError) ||
+		IsPlacementGroupMustError(message) ||
+		IsPlacementGroupPriorError(message)
+}
+
+func IsPlacementGroupMustError(message string) bool {
+	return strings.Contains(message, PlacementGroupMustError)
+}
+
+func IsPlacementGroupPriorError(message string) bool {
+	return strings.Contains(message, PlacementGroupPriorError)
+}
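Reviewer note: the retry-gating rule that canRetry implements in controllers/tower_cache.go is easiest to evaluate in isolation. The sketch below is a standalone re-implementation of just that rule for review purposes; the names resourceRecord and silencePeriod are illustrative stand-ins and are not part of this patch.

```go
package main

import (
	"fmt"
	"time"
)

// silencePeriod mirrors silenceTime in tower_cache.go: after a scheduling
// error is recorded, retries are suppressed until the period has passed.
const silencePeriod = 5 * time.Minute

// resourceRecord is an illustrative stand-in for clusterResource.
type resourceRecord struct {
	isUnmet      bool
	lastDetected time.Time
	lastRetried  time.Time
}

// canRetry mirrors the patched gating rule: no retry while the error is
// cleared (there is nothing to retry), none inside the silence window after
// the last detection or the last granted retry, and at most one retry per
// window because a granted retry advances lastRetried.
func canRetry(r *resourceRecord, now time.Time) bool {
	if !r.isUnmet {
		return false
	}
	if now.Before(r.lastDetected.Add(silencePeriod)) || now.Before(r.lastRetried.Add(silencePeriod)) {
		return false
	}
	r.lastRetried = now
	return true
}

func main() {
	detected := time.Now()
	r := &resourceRecord{isUnmet: true, lastDetected: detected, lastRetried: detected}

	fmt.Println(canRetry(r, detected.Add(1*time.Minute))) // false: still inside the silence window
	fmt.Println(canRetry(r, detected.Add(6*time.Minute))) // true: window passed, one retry granted
	fmt.Println(canRetry(r, detected.Add(7*time.Minute))) // false: lastRetried was just advanced
}
```

Because a granted retry advances LastRetried, a Create/PowerOn attempt is made at most once per silence window; this is also why the tests call expireELFScheduleVMError to rewind both timestamps rather than sleeping through the window.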