From d1936edab08da99f86c3696010ccf332ef3f9f0f Mon Sep 17 00:00:00 2001
From: Robin Deeboonchai <rdeeboonchai@microsoft.com>
Date: Thu, 16 Jan 2025 14:57:20 -0800
Subject: [PATCH] fix: don't crash when vmss not present or has no nodes

---
 .../cloudprovider/azure/azure_cache.go        |  2 +-
 .../cloudprovider/azure/azure_manager_test.go | 43 +++++++++++++++
 .../cloudprovider/azure/azure_scale_set.go    | 55 ++++++++++++++-----
 3 files changed, 84 insertions(+), 16 deletions(-)

diff --git a/cluster-autoscaler/cloudprovider/azure/azure_cache.go b/cluster-autoscaler/cloudprovider/azure/azure_cache.go
index 14ce9254e2c5..334048164dfc 100644
--- a/cluster-autoscaler/cloudprovider/azure/azure_cache.go
+++ b/cluster-autoscaler/cloudprovider/azure/azure_cache.go
@@ -170,7 +170,7 @@ func (m *azureCache) regenerate() error {
 		if err != nil {
 			return err
 		}
-		klog.V(4).Infof("regenerate: found nodes for node group %s: %+v", ng.Id(), instances)
+		klog.V(4).Infof("regenerate: found %d nodes for node group %s: %+v", len(instances), ng.Id(), instances)
 
 		for _, instance := range instances {
 			ref := azureRef{Name: instance.Id}
diff --git a/cluster-autoscaler/cloudprovider/azure/azure_manager_test.go b/cluster-autoscaler/cloudprovider/azure/azure_manager_test.go
index b8262a81348a..0733982e2653 100644
--- a/cluster-autoscaler/cloudprovider/azure/azure_manager_test.go
+++ b/cluster-autoscaler/cloudprovider/azure/azure_manager_test.go
@@ -1201,6 +1201,49 @@ func TestGetScaleSetOptions(t *testing.T) {
 	assert.Equal(t, *opts, defaultOptions)
 }
 
+// TestVMSSNotFound ensures that AzureManager is still able to be built
+// if one nodeGroup (VMSS) is not found. Previously, we would fail on manager creation
+// if even one expected nodeGroup was not found. When manager creation errored out,
+// BuildAzure returns log.Fatalf() which caused CAS to crash.
+func TestVMSSNotFound(t *testing.T) {
+	// client setup
+	ctrl := gomock.NewController(t)
+	defer ctrl.Finish()
+	mockVMSSClient := mockvmssclient.NewMockInterface(ctrl)
+	mockVMClient := mockvmclient.NewMockInterface(ctrl)
+	mockVMSSVMClient := mockvmssvmclient.NewMockInterface(ctrl)
+	client := azClient{}
+	client.virtualMachineScaleSetsClient = mockVMSSClient
+	client.virtualMachinesClient = mockVMClient
+	client.virtualMachineScaleSetVMsClient = mockVMSSVMClient
+
+	// Expect that no vmss are present in the vmss client
+	mockVMSSVMClient.EXPECT().List(gomock.Any(), "fakeId", testASG, gomock.Any()).Return([]compute.VirtualMachineScaleSetVM{}, nil).AnyTimes()
+	mockVMClient.EXPECT().List(gomock.Any(), "fakeId").Return([]compute.VirtualMachine{}, nil).AnyTimes()
+	mockVMSSClient.EXPECT().List(gomock.Any(), "fakeId").Return([]compute.VirtualMachineScaleSet{}, nil).AnyTimes()
+
+	// Add explicit node group to look for during init
+	ngdo := cloudprovider.NodeGroupDiscoveryOptions{
+		NodeGroupSpecs: []string{
+			fmt.Sprintf("%d:%d:%s", 1, 3, testASG),
+		},
+	}
+
+	// We expect the initial BuildAzure flow to pass when a NodeGroup is detected
+	// that doesn't have a corresponding VMSS in the cache.
+	t.Run("should not error when VMSS not found in cache", func(t *testing.T) {
+		manager, err := createAzureManagerInternal(strings.NewReader(validAzureCfg), ngdo, &client)
+		assert.NoError(t, err)
+		// expect one nodegroup to be present
+		nodeGroups := manager.getNodeGroups()
+		assert.Len(t, nodeGroups, 1)
+		assert.Equal(t, nodeGroups[0].Id(), testASG)
+		// expect no scale sets to be present
+		scaleSets := manager.azureCache.getScaleSets()
+		assert.Len(t, scaleSets, 0)
+	})
+}
+
 func assertStructsMinimallyEqual(t *testing.T, struct1, struct2 interface{}) bool {
 	return compareStructFields(t, reflect.ValueOf(struct1), reflect.ValueOf(struct2))
 }
diff --git a/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go b/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go
index 64d5b93a2c8b..1b57dee63e0f 100644
--- a/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go
+++ b/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go
@@ -167,7 +167,12 @@ func (scaleSet *ScaleSet) Autoprovisioned() bool {
 func (scaleSet *ScaleSet) GetOptions(defaults config.NodeGroupAutoscalingOptions) (*config.NodeGroupAutoscalingOptions, error) {
 	template, err := scaleSet.getVMSSFromCache()
 	if err != nil {
-		return nil, err
+		klog.Errorf("failed to get information for VMSS: %s", scaleSet.Name)
+		// Note: We don't return an error here and instead accept defaults.
+		// Every invocation of GetOptions() returns an error if this condition is met:
+		// `if err != nil && err != cloudprovider.ErrNotImplemented`
+		// The error return value is intended to only capture unimplemented.
+		return nil, nil
 	}
 	return scaleSet.manager.GetScaleSetOptions(*template.Name, defaults), nil
 }
@@ -187,14 +192,14 @@ func (scaleSet *ScaleSet) getVMSSFromCache() (compute.VirtualMachineScaleSet, er
 	return allVMSS[scaleSet.Name], nil
 }
 
-func (scaleSet *ScaleSet) getCurSize() (int64, error) {
+func (scaleSet *ScaleSet) getCurSize() (int64, *GetVMSSFailedError) {
 	scaleSet.sizeMutex.Lock()
 	defer scaleSet.sizeMutex.Unlock()
 
 	set, err := scaleSet.getVMSSFromCache()
 	if err != nil {
 		klog.Errorf("failed to get information for VMSS: %s, error: %v", scaleSet.Name, err)
-		return -1, err
+		return -1, newGetVMSSFailedError(err, true)
 	}
 
 	// // Remove check for returning in-memory size when VMSS is in updating state
@@ -231,7 +236,7 @@ func (scaleSet *ScaleSet) getCurSize() (int64, error) {
 		set, rerr = scaleSet.manager.azClient.virtualMachineScaleSetsClient.Get(ctx, scaleSet.manager.config.ResourceGroup, scaleSet.Name)
 		if rerr != nil {
 			klog.Errorf("failed to get information for VMSS: %s, error: %v", scaleSet.Name, rerr)
-			return -1, err
+			return -1, newGetVMSSFailedError(rerr.Error(), rerr.IsNotFound())
 		}
 	}
 
@@ -253,12 +258,11 @@ func (scaleSet *ScaleSet) getCurSize() (int64, error) {
 
 // getScaleSetSize gets Scale Set size.
 func (scaleSet *ScaleSet) getScaleSetSize() (int64, error) {
-	// First, get the size of the ScaleSet reported by API
-	// -1 indiciates the ScaleSet hasn't been initialized
-	size, err := scaleSet.getCurSize()
-	if size == -1 || err != nil {
-		klog.V(3).Infof("getScaleSetSize: either size is -1 (actual: %d) or error exists (actual err:%v)", size, err)
-		return size, err
+	// First, get the current size of the ScaleSet
+	size, getVMSSError := scaleSet.getCurSize()
+	if size == -1 || getVMSSError != nil {
+		klog.V(3).Infof("getScaleSetSize: either size is -1 (actual: %d) or error exists (actual err:%v)", size, getVMSSError.error)
+		return size, getVMSSError.error
 	}
 	return size, nil
 }
@@ -657,10 +661,13 @@ func (scaleSet *ScaleSet) TemplateNodeInfo() (*schedulerframework.NodeInfo, erro
 
 // Nodes returns a list of all nodes that belong to this node group.
 func (scaleSet *ScaleSet) Nodes() ([]cloudprovider.Instance, error) {
-	curSize, err := scaleSet.getCurSize()
-	if err != nil {
-		klog.Errorf("Failed to get current size for vmss %q: %v", scaleSet.Name, err)
-		return nil, err
+	curSize, getVMSSError := scaleSet.getCurSize()
+	if getVMSSError != nil {
+		klog.Errorf("Failed to get current size for vmss %q: %v", scaleSet.Name, getVMSSError.error)
+		if getVMSSError.notFound {
+			return []cloudprovider.Instance{}, nil // Don't return error if VMSS not found
+		}
+		return nil, getVMSSError.error // We want to return error if other errors occur.
 	}
 
 	scaleSet.instanceMutex.Lock()
@@ -673,7 +680,7 @@ func (scaleSet *ScaleSet) Nodes() ([]cloudprovider.Instance, error) {
 	}
 
 	// Forcefully updating the instanceCache as the instanceCacheSize didn't match curSize or cache is invalid.
-	err = scaleSet.updateInstanceCache()
+	err := scaleSet.updateInstanceCache()
 	if err != nil {
 		return nil, err
 	}
@@ -892,3 +899,21 @@ func (scaleSet *ScaleSet) verifyNodeGroup(instance *azureRef, commonNgID string)
 	}
 	return nil
 }
+
+// GetVMSSFailedError is used to differentiate between
+// NotFound and other errors
+type GetVMSSFailedError struct {
+	notFound bool
+	error    error
+}
+
+func newGetVMSSFailedError(error error, notFound bool) *GetVMSSFailedError {
+	return &GetVMSSFailedError{
+		error:    error,
+		notFound: notFound,
+	}
+}
+
+func (v *GetVMSSFailedError) Error() string {
+	return v.error.Error()
+}