From b7253431d0d5766bbd4c2c203bb430b13f5374fe Mon Sep 17 00:00:00 2001 From: Hardik Dodiya Date: Mon, 5 Oct 2020 18:12:49 +0530 Subject: [PATCH] Introduce constant backoff while enqueuing machine objects on creation and deletion failures Co-authored-by: Prashanth --- pkg/apis/machine/types.go | 3 + pkg/apis/machine/v1alpha1/machine_types.go | 5 +- pkg/controller/controller_utils.go | 13 +- pkg/controller/deployment_machineset_util.go | 2 +- pkg/controller/machine.go | 131 +++++++++++++++---- pkg/controller/machine_test.go | 130 +++++++++++++++++- pkg/controller/metrics.go | 2 + 7 files changed, 250 insertions(+), 36 deletions(-) diff --git a/pkg/apis/machine/types.go b/pkg/apis/machine/types.go index 7a9f82340..2e9fc99a9 100644 --- a/pkg/apis/machine/types.go +++ b/pkg/apis/machine/types.go @@ -222,6 +222,9 @@ const ( // MachineFailed means operation failed leading to machine status failure MachineFailed MachinePhase = "Failed" + + // MachineCrashLoopBackOff means creation or deletion of the machine is failing. + MachineCrashLoopBackOff MachinePhase = "CrashLoopBackOff" ) // MachinePhase is a label for the condition of a machines at the current time. diff --git a/pkg/apis/machine/v1alpha1/machine_types.go b/pkg/apis/machine/v1alpha1/machine_types.go index 9cb85438f..a311d2f87 100644 --- a/pkg/apis/machine/v1alpha1/machine_types.go +++ b/pkg/apis/machine/v1alpha1/machine_types.go @@ -146,9 +146,12 @@ const ( // MachineFailed means operation failed leading to machine status failure MachineFailed MachinePhase = "Failed" + + // MachineCrashLoopBackOff means creation or deletion of the machine is failing. + MachineCrashLoopBackOff MachinePhase = "CrashLoopBackOff" ) -// MachinePhase is a label for the condition of a machines at the current time. +// MachineState is a current state of the machine. type MachineState string // These are the valid statuses of machines. diff --git a/pkg/controller/controller_utils.go b/pkg/controller/controller_utils.go index d3c2669a9..e4f05d2ab 100644 --- a/pkg/controller/controller_utils.go +++ b/pkg/controller/controller_utils.go @@ -767,12 +767,13 @@ func (s ActiveMachines) Less(i, j int) bool { // the lower the priority, the more likely // it is to be deleted m := map[v1alpha1.MachinePhase]int{ - v1alpha1.MachineTerminating: 0, - v1alpha1.MachineFailed: 1, - v1alpha1.MachineUnknown: 2, - v1alpha1.MachinePending: 3, - v1alpha1.MachineAvailable: 4, - v1alpha1.MachineRunning: 5, + v1alpha1.MachineTerminating: 0, + v1alpha1.MachineFailed: 1, + v1alpha1.MachineCrashLoopBackOff: 2, + v1alpha1.MachineUnknown: 3, + v1alpha1.MachinePending: 4, + v1alpha1.MachineAvailable: 5, + v1alpha1.MachineRunning: 6, } // Case-1: Initially we try to prioritize machine deletion based on diff --git a/pkg/controller/deployment_machineset_util.go b/pkg/controller/deployment_machineset_util.go index 2ca7078cb..275b57438 100644 --- a/pkg/controller/deployment_machineset_util.go +++ b/pkg/controller/deployment_machineset_util.go @@ -93,7 +93,7 @@ func calculateMachineSetStatus(is *v1alpha1.MachineSet, filteredMachines []*v1al // Count the number of machines that have labels matching the labels of the machine // template of the machine set, the matching machines may have more // labels than are in the template. Because the label of machineTemplateSpec is - // a supeiset of the selector of the machine set, so the possible + // a superset of the selector of the machine set, so the possible // matching machines must be part of the filteredmachines. fullyLabeledReplicasCount := 0 readyReplicasCount := 0 diff --git a/pkg/controller/machine.go b/pkg/controller/machine.go index 7e792a148..1203ad77b 100644 --- a/pkg/controller/machine.go +++ b/pkg/controller/machine.go @@ -27,12 +27,14 @@ import ( "k8s.io/klog" + corev1 "k8s.io/api/core/v1" v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/selection" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/client-go/tools/cache" + apiequality "k8s.io/apimachinery/pkg/api/equality" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -48,6 +50,9 @@ const ( // priority the more likely it is to be deleted first // Default priority for a machine is set to 3 MachinePriority = "machinepriority.machine.sapcloud.io" + + // MachineEnqueueRetryPeriod period after which a machine object is re-enqueued upon creation or deletion failures. + MachineEnqueueRetryPeriod = 2 * time.Minute ) /* @@ -158,7 +163,8 @@ func (c *controller) reconcileClusterMachine(machine *v1alpha1.Machine) error { // Validate MachineClass MachineClass, secretRef, err := c.validateMachineClass(&machine.Spec.Class) if err != nil || secretRef == nil { - return err + c.enqueueMachineAfter(machine, MachineEnqueueRetryPeriod) + return nil } driver := driver.NewDriver(machine.Spec.ProviderID, secretRef, machine.Spec.Class.Kind, MachineClass, machine.Name) @@ -200,7 +206,8 @@ func (c *controller) reconcileClusterMachine(machine *v1alpha1.Machine) error { if machine.DeletionTimestamp != nil { // Processing of delete event if err := c.machineDelete(machine, driver); err != nil { - return err + c.enqueueMachineAfter(machine, MachineEnqueueRetryPeriod) + return nil } } else if machine.Status.CurrentStatus.TimeoutActive { // Processing machine @@ -213,7 +220,8 @@ func (c *controller) reconcileClusterMachine(machine *v1alpha1.Machine) error { return nil } else if actualProviderID == "" { if err := c.machineCreate(machine, driver); err != nil { - return err + c.enqueueMachineAfter(machine, MachineEnqueueRetryPeriod) + return nil } } else if actualProviderID != machine.Spec.ProviderID { if err := c.machineUpdate(machine, actualProviderID); err != nil { @@ -231,6 +239,12 @@ func (c *controller) reconcileClusterMachine(machine *v1alpha1.Machine) error { */ func (c *controller) addNodeToMachine(obj interface{}) { + node := obj.(*corev1.Node) + if node == nil { + klog.Errorf("Couldn't convert to node from object") + return + } + key, err := cache.DeletionHandlingMetaNamespaceKeyFunc(obj) if err != nil { klog.Errorf("Couldn't get key for object %+v: %v", obj, err) @@ -245,8 +259,10 @@ func (c *controller) addNodeToMachine(obj interface{}) { return } - klog.V(4).Infof("Add machine object backing node %q", machine.Name) - c.enqueueMachine(machine) + if machine.Status.CurrentStatus.Phase != v1alpha1.MachineCrashLoopBackOff && nodeConditionsHaveChanged(machine.Status.Conditions, node.Status.Conditions) { + klog.V(2).Infof("Enqueue machine object %q as backing node's conditions have changed", machine.Name) + c.enqueueMachine(machine) + } } func (c *controller) updateNodeToMachine(oldObj, newObj interface{}) { @@ -254,7 +270,28 @@ func (c *controller) updateNodeToMachine(oldObj, newObj interface{}) { } func (c *controller) deleteNodeToMachine(obj interface{}) { - c.addNodeToMachine(obj) + + node := obj.(*corev1.Node) + if node == nil { + klog.Errorf("Couldn't convert to node from object") + return + } + + key, err := cache.DeletionHandlingMetaNamespaceKeyFunc(obj) + if err != nil { + klog.Errorf("Couldn't get key for object %+v: %v", obj, err) + return + } + + machine, err := c.getMachineFromNode(key) + if err != nil { + klog.Errorf("Couldn't fetch machine %s, Error: %s", key, err) + return + } else if machine == nil { + return + } + + c.enqueueMachine(machine) } /* @@ -411,6 +448,26 @@ func (c *controller) machineCreate(machine *v1alpha1.Machine, driver driver.Driv // Before actually creating the machine, we should once check and adopt if the virtual machine already exists. VMList, err := driver.GetVMs("") if err != nil { + + klog.Errorf("Error while listing machine %s: %s", machine.Name, err.Error()) + lastOperation := v1alpha1.LastOperation{ + Description: "Cloud provider message - " + err.Error(), + State: v1alpha1.MachineStateFailed, + Type: v1alpha1.MachineOperationCreate, + LastUpdateTime: metav1.Now(), + } + currentStatus := v1alpha1.CurrentStatus{ + Phase: v1alpha1.MachineCrashLoopBackOff, + TimeoutActive: false, + LastUpdateTime: metav1.Now(), + } + c.updateMachineStatus(machine, lastOperation, currentStatus) + + // Delete the bootstrap token + if err := c.deleteBootstrapToken(machine.Name); err != nil { + klog.Warning(err) + } + klog.Errorf("Failed to list VMs before creating machine %q %+v", machine.Name, err) return err } @@ -433,7 +490,7 @@ func (c *controller) machineCreate(machine *v1alpha1.Machine, driver driver.Driv LastUpdateTime: metav1.Now(), } currentStatus := v1alpha1.CurrentStatus{ - Phase: v1alpha1.MachineFailed, + Phase: v1alpha1.MachineCrashLoopBackOff, TimeoutActive: false, LastUpdateTime: metav1.Now(), } @@ -692,22 +749,23 @@ func (c *controller) machineDelete(machine *v1alpha1.Machine, driver driver.Driv klog.V(2).Infof("Machine %q on deletion doesn't have a providerID attached to it. Checking for any VM linked to this machine object.", machine.Name) // As MachineID is missing, we should check once if actual VM was created but MachineID was not updated on machine-object. // We list VMs and check if any one them map with the given machine-object. - VMList, err := driver.GetVMs("") - if err != nil { - klog.Errorf("Failed to list VMs while deleting the machine %q %v", machine.Name, err) - return err - } - for providerID, machineName := range VMList { - if machineName == machine.Name { - klog.V(2).Infof("Deleting the VM %s backing the machine-object %s.", providerID, machine.Name) - err = driver.Delete(providerID) - if err != nil { - klog.Errorf("Error deleting the VM %s backing the machine-object %s due to error %v", providerID, machine.Name, err) - // Not returning error so that status on the machine object can be updated in the next step if errored. - } else { - klog.V(2).Infof("VM %s backing the machine-object %s is deleted successfully", providerID, machine.Name) + var VMList map[string]string + VMList, err = driver.GetVMs("") + if err == nil { + for providerID, machineName := range VMList { + if machineName == machine.Name { + klog.V(2).Infof("Deleting the VM %s backing the machine-object %s.", providerID, machine.Name) + err = driver.Delete(providerID) + if err != nil { + klog.Errorf("Error deleting the VM %s backing the machine-object %s due to error %v", providerID, machine.Name, err) + // Not returning error so that status on the machine object can be updated in the next step if errored. + } else { + klog.V(2).Infof("VM %s backing the machine-object %s is deleted successfully", providerID, machine.Name) + } } } + } else { + klog.Errorf("Failed to list VMs while deleting the machine %q %v", machine.Name, err) } } @@ -722,7 +780,7 @@ func (c *controller) machineDelete(machine *v1alpha1.Machine, driver driver.Driv LastUpdateTime: metav1.Now(), } currentStatus := v1alpha1.CurrentStatus{ - Phase: v1alpha1.MachineFailed, + Phase: v1alpha1.MachineTerminating, TimeoutActive: false, LastUpdateTime: metav1.Now(), } @@ -775,14 +833,18 @@ func (c *controller) updateMachineStatus( currentStatus v1alpha1.CurrentStatus, ) (*v1alpha1.Machine, error) { // Get the latest version of the machine so that we can avoid conflicts - clone, err := c.controlMachineClient.Machines(machine.Namespace).Get(machine.Name, metav1.GetOptions{}) + latestMachine, err := c.controlMachineClient.Machines(machine.Namespace).Get(machine.Name, metav1.GetOptions{}) if err != nil { - return machine, err + return nil, err } + clone := latestMachine.DeepCopy() - clone = clone.DeepCopy() clone.Status.LastOperation = lastOperation clone.Status.CurrentStatus = currentStatus + if isMachineStatusEqual(clone.Status, machine.Status) { + klog.V(3).Infof("Not updating the status of the machine object %q , as it is already same", clone.Name) + return machine, nil + } clone, err = c.controlMachineClient.Machines(clone.Namespace).UpdateStatus(clone) if err != nil { @@ -793,6 +855,25 @@ func (c *controller) updateMachineStatus( return clone, nil } +// isMachineStatusEqual checks if the status of 2 machines is similar or not. +func isMachineStatusEqual(s1, s2 v1alpha1.MachineStatus) bool { + tolerateTimeDiff := 30 * time.Minute + s1Copy, s2Copy := s1.DeepCopy(), s2.DeepCopy() + s1Copy.LastOperation.Description, s2Copy.LastOperation.Description = "", "" + + if (s1Copy.LastOperation.LastUpdateTime.Time.Before(time.Now().Add(tolerateTimeDiff * -1))) || (s2Copy.LastOperation.LastUpdateTime.Time.Before(time.Now().Add(tolerateTimeDiff * -1))) { + return false + } + s1Copy.LastOperation.LastUpdateTime, s2Copy.LastOperation.LastUpdateTime = metav1.Time{}, metav1.Time{} + + if (s1Copy.CurrentStatus.LastUpdateTime.Time.Before(time.Now().Add(tolerateTimeDiff * -1))) || (s2Copy.CurrentStatus.LastUpdateTime.Time.Before(time.Now().Add(tolerateTimeDiff * -1))) { + return false + } + s1Copy.CurrentStatus.LastUpdateTime, s2Copy.CurrentStatus.LastUpdateTime = metav1.Time{}, metav1.Time{} + + return apiequality.Semantic.DeepEqual(s1Copy.LastOperation, s2Copy.LastOperation) && apiequality.Semantic.DeepEqual(s1Copy.CurrentStatus, s2Copy.CurrentStatus) +} + func (c *controller) updateMachineConditions(machine *v1alpha1.Machine, conditions []v1.NodeCondition) (*v1alpha1.Machine, error) { var ( diff --git a/pkg/controller/machine_test.go b/pkg/controller/machine_test.go index 1dcb1b3ae..e5fb1b223 100644 --- a/pkg/controller/machine_test.go +++ b/pkg/controller/machine_test.go @@ -148,8 +148,33 @@ var _ = Describe("machine", func() { machineRet, errRet := c.updateMachineStatus(machine, lastOperation, currentStatus) Expect(errRet).Should(Not(BeNil())) Expect(errRet).Should(BeIdenticalTo(err)) - Expect(machineRet).Should(Not(BeNil())) - Expect(machineRet).Should(BeIdenticalTo(machine)) + Expect(machineRet).Should(BeNil()) + }) + + It("shouldn't update status when it is already same", func() { + machine.Status.LastOperation = lastOperation + machine.Status.CurrentStatus = currentStatus + + lastOperation = machinev1.LastOperation{ + Description: "test operation dummy", + LastUpdateTime: metav1.Now(), + State: machinev1.MachineStateProcessing, + Type: machinev1.MachineOperationCreate, + } + currentStatus = machinev1.CurrentStatus{ + LastUpdateTime: lastOperation.LastUpdateTime, + Phase: machinev1.MachinePending, + TimeoutActive: true, + } + + fakeMachineClient.AddReactor("get", "machines", func(action k8stesting.Action) (bool, runtime.Object, error) { + return true, machine, nil + }) + + machineRet, errRet := c.updateMachineStatus(machine, lastOperation, currentStatus) + Expect(errRet).Should((BeNil())) + Expect(machineRet.Status.LastOperation).Should(BeIdenticalTo(machine.Status.LastOperation)) + Expect(machineRet.Status.CurrentStatus).Should(BeIdenticalTo(machine.Status.CurrentStatus)) }) It("should return success", func() { @@ -362,6 +387,105 @@ var _ = Describe("machine", func() { ) }) + Describe("#isMachineStatusEqual", func() { + type expect struct { + equal bool + } + type action struct { + s1 machinev1.MachineStatus + s2 machinev1.MachineStatus + } + type data struct { + action action + expect expect + } + + lastOperation := machinev1.LastOperation{ + Description: "test operation", + LastUpdateTime: metav1.Now(), + State: machinev1.MachineStateProcessing, + Type: machinev1.MachineOperationCreate, + } + currentStatus := machinev1.CurrentStatus{ + LastUpdateTime: lastOperation.LastUpdateTime, + Phase: machinev1.MachinePending, + TimeoutActive: true, + } + + DescribeTable("##table", + func(data *data) { + stop := make(chan struct{}) + defer close(stop) + + equal := isMachineStatusEqual(data.action.s1, data.action.s2) + Expect(equal).To(Equal(data.expect.equal)) + }, + Entry("return true as status is same", &data{ + action: action{ + s1: machinev1.MachineStatus{ + LastOperation: lastOperation, + CurrentStatus: currentStatus, + }, + s2: machinev1.MachineStatus{ + LastOperation: lastOperation, + CurrentStatus: currentStatus, + }, + }, + expect: expect{ + equal: true, + }, + }), + Entry("return false as status is not equal", &data{ + action: action{ + s1: machinev1.MachineStatus{ + LastOperation: lastOperation, + CurrentStatus: currentStatus, + }, + s2: machinev1.MachineStatus{ + LastOperation: machinev1.LastOperation{ + Description: "test operation dummy", + LastUpdateTime: metav1.Now(), + State: machinev1.MachineStateProcessing, + Type: machinev1.MachineOperationDelete, + }, + CurrentStatus: machinev1.CurrentStatus{ + LastUpdateTime: lastOperation.LastUpdateTime, + Phase: machinev1.MachinePending, + TimeoutActive: true, + }, + }, + }, + expect: expect{ + equal: false, + }, + }), + Entry("return true as only description is not same", &data{ + action: action{ + s1: machinev1.MachineStatus{ + LastOperation: lastOperation, + CurrentStatus: currentStatus, + }, + s2: machinev1.MachineStatus{ + LastOperation: machinev1.LastOperation{ + Description: "test operation dummy dummy", + LastUpdateTime: metav1.Now(), + State: machinev1.MachineStateProcessing, + Type: machinev1.MachineOperationCreate, + }, + CurrentStatus: machinev1.CurrentStatus{ + LastUpdateTime: lastOperation.LastUpdateTime, + Phase: machinev1.MachinePending, + TimeoutActive: true, + }, + }, + }, + expect: expect{ + equal: true, + }, + }), + ) + }) + Describe("#validateMachineClass", func() { type setup struct { aws []*machinev1.AWSMachineClass @@ -744,7 +868,7 @@ var _ = Describe("machine", func() { err: false, }, }), - Entry("Orphan VM deletion on faling to find referred machine object", &data{ + Entry("Orphan VM deletion on failing to find referred machine object", &data{ setup: setup{ secrets: []*corev1.Secret{ { diff --git a/pkg/controller/metrics.go b/pkg/controller/metrics.go index 51e126128..b0bee33b5 100644 --- a/pkg/controller/metrics.go +++ b/pkg/controller/metrics.go @@ -291,6 +291,8 @@ func (c *controller) CollectMachineMetrics(ch chan<- prometheus.Metric) { phase = 2 case v1alpha1.MachineFailed: phase = 3 + case v1alpha1.MachineCrashLoopBackOff: + phase = 4 } metrics.MachineCSPhase.With(prometheus.Labels{ "name": mMeta.Name,