Skip to content

Commit

Permalink
Showing 62 changed files with 34,631 additions and 38 deletions.
6 changes: 6 additions & 0 deletions cluster-autoscaler/cloudprovider/azure/README.md
Original file line number Diff line number Diff line change
@@ -184,6 +184,12 @@ A configurable jitter (`AZURE_VMSS_VMS_CACHE_JITTER` environment variable, defau
| vmssVmsCacheTTL | 300 | AZURE_VMSS_VMS_CACHE_TTL | vmssVmsCacheTTL |
| vmssVmsCacheJitter | 0 | AZURE_VMSS_VMS_CACHE_JITTER | vmssVmsCacheJitter |

The `AZURE_ENABLE_DYNAMIC_INSTANCE_LIST` environment variable enables a workflow that fetches SKU information dynamically using SKU API calls. By default, a static list of SKUs is used.

| Config Name | Default | Environment Variable | Cloud Config File |
|---------------------------|---------|------------------------------------|---------------------------|
| enableDynamicInstanceList | false | AZURE_ENABLE_DYNAMIC_INSTANCE_LIST | enableDynamicInstanceList |

When using K8s 1.18 or higher, it is also recommended to configure backoff and retries on the client as described [here](#rate-limit-and-back-off-retries)

### Standard deployment
7 changes: 7 additions & 0 deletions cluster-autoscaler/cloudprovider/azure/azure_client.go
Original file line number Diff line number Diff line change
@@ -23,6 +23,7 @@ import (
"net/http"
"time"

"github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2019-07-01/compute"
"github.com/Azure/azure-sdk-for-go/services/resources/mgmt/2017-05-10/resources"
"github.com/Azure/azure-sdk-for-go/services/storage/mgmt/2021-02-01/storage"
"github.com/Azure/go-autorest/autorest"
@@ -150,6 +151,7 @@ type azClient struct {
disksClient diskclient.Interface
storageAccountsClient storageaccountclient.Interface
managedKubernetesServicesClient containerserviceclient.Interface
skuClient compute.ResourceSkusClient
}

// newServicePrincipalTokenFromCredentials creates a new ServicePrincipalToken using values of the
@@ -263,6 +265,10 @@ func newAzClient(cfg *Config, env *azure.Environment) (*azClient, error) {
kubernetesServicesClient := containerserviceclient.New(aksClientConfig)
klog.V(5).Infof("Created kubernetes services client with authorizer: %v", kubernetesServicesClient)

skuClient := compute.NewResourceSkusClient(cfg.SubscriptionID)
skuClient.Authorizer = azClientConfig.Authorizer
klog.V(5).Infof("Created sku client with authorizer: %v", skuClient)

return &azClient{
disksClient: disksClient,
interfacesClient: interfacesClient,
@@ -272,5 +278,6 @@ func newAzClient(cfg *Config, env *azure.Environment) (*azClient, error) {
virtualMachinesClient: virtualMachinesClient,
storageAccountsClient: storageAccountsClient,
managedKubernetesServicesClient: kubernetesServicesClient,
skuClient: skuClient,
}, nil
}
15 changes: 15 additions & 0 deletions cluster-autoscaler/cloudprovider/azure/azure_config.go
Original file line number Diff line number Diff line change
@@ -59,6 +59,9 @@ const (
// auth methods
authMethodPrincipal = "principal"
authMethodCLI = "cli"

// toggle
dynamicInstanceListDefault = false
)

// CloudProviderRateLimitConfig indicates the rate limit config for each clients.
@@ -128,6 +131,9 @@ type Config struct {
CloudProviderBackoffExponent float64 `json:"cloudProviderBackoffExponent,omitempty" yaml:"cloudProviderBackoffExponent,omitempty"`
CloudProviderBackoffDuration int `json:"cloudProviderBackoffDuration,omitempty" yaml:"cloudProviderBackoffDuration,omitempty"`
CloudProviderBackoffJitter float64 `json:"cloudProviderBackoffJitter,omitempty" yaml:"cloudProviderBackoffJitter,omitempty"`

// EnableDynamicInstanceList defines whether to enable dynamic instance workflow for instance information check
EnableDynamicInstanceList bool `json:"enableDynamicInstanceList,omitempty" yaml:"enableDynamicInstanceList,omitempty"`
}

// BuildAzureConfig returns a Config object for the Azure clients
@@ -212,6 +218,15 @@ func BuildAzureConfig(configReader io.Reader) (*Config, error) {
}
}

if enableDynamicInstanceList := os.Getenv("AZURE_ENABLE_DYNAMIC_INSTANCE_LIST"); enableDynamicInstanceList != "" {
cfg.EnableDynamicInstanceList, err = strconv.ParseBool(enableDynamicInstanceList)
if err != nil {
return nil, fmt.Errorf("failed to parse AZURE_ENABLE_DYNAMIC_INSTANCE_LIST %q: %v", enableDynamicInstanceList, err)
}
} else {
cfg.EnableDynamicInstanceList = dynamicInstanceListDefault
}

if cfg.CloudProviderBackoff {
if backoffRetries := os.Getenv("BACKOFF_RETRIES"); backoffRetries != "" {
retries, err := strconv.ParseInt(backoffRetries, 10, 0)
107 changes: 107 additions & 0 deletions cluster-autoscaler/cloudprovider/azure/azure_instance.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package azure

import (
"context"
"fmt"
compute20190701 "github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2019-07-01/compute"
"github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2020-12-01/compute"
"github.com/Azure/skewer"
"k8s.io/klog/v2"
"regexp"
"strings"
)

// GetVMSSTypeStatically looks up the scale set's instance type in the static
// InstanceTypes table (generated in azure_instance_types.go).
//
// The SKU name is matched case-insensitively. When there is no exact match and
// the name contains "_promo" (in any case), the lookup is retried with that
// marker removed so that a promo SKU resolves to its standard counterpart.
//
// It returns an error when the template carries no SKU name or when the SKU is
// not present in the table. It is declared as a variable for testing purposes.
var GetVMSSTypeStatically = func(template compute.VirtualMachineScaleSet) (*InstanceType, error) {
	// Guard the pointer fields up front: an incomplete template must yield an
	// error, not a nil-pointer panic.
	if template.Sku == nil || template.Sku.Name == nil {
		return nil, fmt.Errorf("scale set template is missing its sku name")
	}
	skuName := *template.Sku.Name

	// lookup returns the table entry whose key equals name ignoring case, or nil.
	lookup := func(name string) *InstanceType {
		for k, v := range InstanceTypes {
			if strings.EqualFold(k, name) {
				return v
			}
		}
		return nil
	}

	if vmssType := lookup(skuName); vmssType != nil {
		return vmssType, nil
	}

	// Only pay for the regexp once the exact lookup has failed.
	promoRe := regexp.MustCompile(`(?i)_promo`)
	if promoRe.MatchString(skuName) {
		// We didn't find an exact match but this is a promo type, check for matching standard
		klog.V(4).Infof("No exact match found for %s, checking standard types", skuName)
		if vmssType := lookup(promoRe.ReplaceAllString(skuName, "")); vmssType != nil {
			return vmssType, nil
		}
	}

	return nil, fmt.Errorf("instance type %q not supported", skuName)
}

// GetVMSSTypeDynamically fetches the scale set's instance information using
// SKU API calls, through a skewer cache built for the template's location.
//
// When the SKU name cannot be resolved as-is, the lookup is retried with any
// "_promo" marker (in any case) removed so that a promo SKU resolves to its
// standard counterpart.
//
// On success the returned InstanceType has VCPU, GPU and MemoryMb populated
// from the SKU's capabilities. On any failure it returns the zero InstanceType
// together with the error. It is declared as a variable for testing purposes.
var GetVMSSTypeDynamically = func(template compute.VirtualMachineScaleSet, skuClient compute20190701.ResourceSkusClient) (InstanceType, error) {
	// Guard the pointer fields up front: an incomplete template must yield an
	// error, not a nil-pointer panic.
	if template.Sku == nil || template.Sku.Name == nil {
		return InstanceType{}, fmt.Errorf("scale set template is missing its sku name")
	}
	if template.Location == nil {
		return InstanceType{}, fmt.Errorf("scale set template is missing its location")
	}
	skuName, location := *template.Sku.Name, *template.Location
	ctx := context.Background()

	cache, err := skewer.NewCache(ctx, skewer.WithLocation(location), skewer.WithResourceClient(skuClient))
	if err != nil {
		klog.V(1).Infof("Failed to instantiate cache, err: %v", err)
		return InstanceType{}, err
	}

	sku, err := cache.Get(ctx, skuName, skewer.VirtualMachines, location)
	if err != nil {
		// The exact name was not resolved; it may be a promo type, so retry
		// with the promo marker stripped to find the matching standard SKU.
		klog.V(1).Infof("No exact match found for %s, checking standard types. Error %v", skuName, err)
		promoRe := regexp.MustCompile(`(?i)_promo`)
		sku, err = cache.Get(ctx, promoRe.ReplaceAllString(skuName, ""), skewer.VirtualMachines, location)
		if err != nil {
			// %w keeps the message text unchanged while letting callers unwrap.
			return InstanceType{}, fmt.Errorf("instance type %q not supported. Error %w", skuName, err)
		}
	}

	vcpu, err := sku.VCPU()
	if err != nil {
		klog.V(1).Infof("Failed to parse vcpu from sku %q %v", skuName, err)
		return InstanceType{}, err
	}

	gpu, err := getGpuFromSku(sku)
	if err != nil {
		klog.V(1).Infof("Failed to parse gpu from sku %q %v", skuName, err)
		return InstanceType{}, err
	}

	memoryGb, err := sku.Memory()
	if err != nil {
		klog.V(1).Infof("Failed to parse memoryMb from sku %q %v", skuName, err)
		return InstanceType{}, err
	}

	return InstanceType{
		VCPU: vcpu,
		GPU:  gpu,
		// Scale before truncating: memoryGb is fractional for some SKUs
		// (e.g. 0.5 or 3.5), and converting to int64 first would drop the
		// fractional part (0.5 GB would become 0 MB).
		MemoryMb: int64(memoryGb * 1024),
	}, nil
}
105 changes: 105 additions & 0 deletions cluster-autoscaler/cloudprovider/azure/azure_instance_gpu_sku.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package azure

import (
"github.com/Azure/skewer"
"github.com/pkg/errors"
"strings"
)

// NvidiaEnabledSKUs is the set of VM SKU names, written in lower case, that are
// treated as NVIDIA GPU enabled. Entries are grouped by GPU model.
//
// When a new GPU SKU becomes available, add its key here only once there is
// confirmation that an agreement with NVIDIA covers that specific GPU.
var NvidiaEnabledSKUs = map[string]bool{
	// K80
	"standard_nc6":   true,
	"standard_nc12":  true,
	"standard_nc24":  true,
	"standard_nc24r": true,

	// M60
	"standard_nv6":      true,
	"standard_nv12":     true,
	"standard_nv12s_v3": true,
	"standard_nv24":     true,
	"standard_nv24s_v3": true,
	"standard_nv24r":    true,
	"standard_nv48s_v3": true,

	// P40
	"standard_nd6s":   true,
	"standard_nd12s":  true,
	"standard_nd24s":  true,
	"standard_nd24rs": true,

	// P100
	"standard_nc6s_v2":   true,
	"standard_nc12s_v2":  true,
	"standard_nc24s_v2":  true,
	"standard_nc24rs_v2": true,

	// V100
	"standard_nc6s_v3":   true,
	"standard_nc12s_v3":  true,
	"standard_nc24s_v3":  true,
	"standard_nc24rs_v3": true,
	"standard_nd40s_v3":  true,
	"standard_nd40rs_v2": true,

	// T4
	"standard_nc4as_t4_v3":  true,
	"standard_nc8as_t4_v3":  true,
	"standard_nc16as_t4_v3": true,
	"standard_nc64as_t4_v3": true,

	// A100 40GB
	"standard_nd96asr_v4":       true,
	"standard_nd112asr_a100_v4": true,
	"standard_nd120asr_a100_v4": true,

	// A100 80GB
	"standard_nd96amsr_a100_v4":  true,
	"standard_nd112amsr_a100_v4": true,
	"standard_nd120amsr_a100_v4": true,

	// A100 PCIE 80GB
	"standard_nc24ads_a100_v4": true,
	"standard_nc48ads_a100_v4": true,
	"standard_nc96ads_a100_v4": true,
}

// isNvidiaEnabledSKU reports whether vmSize is listed in NvidiaEnabledSKUs.
// The name is lower-cased and an optional trailing "_promo" is removed before
// the lookup, so "Standard_NC6_Promo" is looked up as "standard_nc6".
func isNvidiaEnabledSKU(vmSize string) bool {
	normalized := strings.TrimSuffix(strings.ToLower(vmSize), "_promo")
	return NvidiaEnabledSKUs[normalized]
}

// getGpuFromSku reads the "GPUs" capability of the given SKU.
//
// A SKU that does not have the capability, or has it with a nil value, is
// reported as having 0 GPUs and no error. Any other lookup error is returned
// unchanged, together with the value the lookup produced.
func getGpuFromSku(sku skewer.SKU) (int64, error) {
	count, err := sku.GetCapabilityIntegerQuantity("GPUs")
	if err == nil {
		return count, nil
	}

	// In case of an error, SKU api returns -1 as the value, so the count is
	// replaced by 0 when the error only says the capability is absent or nil.
	var (
		valueNil *skewer.ErrCapabilityValueNil
		notFound *skewer.ErrCapabilityNotFound
	)
	if errors.As(err, &valueNil) || errors.As(err, &notFound) {
		return 0, nil
	}
	return count, err
}
18 changes: 11 additions & 7 deletions cluster-autoscaler/cloudprovider/azure/azure_scale_set.go
Original file line number Diff line number Diff line change
@@ -52,6 +52,8 @@ type ScaleSet struct {
sizeMutex sync.Mutex
curSize int64

enableDynamicInstanceList bool

lastSizeRefresh time.Time
sizeRefreshPeriod time.Duration

@@ -69,12 +71,13 @@ func NewScaleSet(spec *dynamic.NodeGroupSpec, az *AzureManager, curSize int64) (
azureRef: azureRef{
Name: spec.Name,
},
minSize: spec.MinSize,
maxSize: spec.MaxSize,
manager: az,
curSize: curSize,
sizeRefreshPeriod: az.azureCache.refreshInterval,
instancesRefreshJitter: az.config.VmssVmsCacheJitter,
minSize: spec.MinSize,
maxSize: spec.MaxSize,
manager: az,
curSize: curSize,
sizeRefreshPeriod: az.azureCache.refreshInterval,
enableDynamicInstanceList: az.config.EnableDynamicInstanceList,
instancesRefreshJitter: az.config.VmssVmsCacheJitter,
}

if az.config.VmssVmsCacheTTL != 0 {
@@ -476,7 +479,8 @@ func (scaleSet *ScaleSet) TemplateNodeInfo() (*schedulerframework.NodeInfo, erro
return nil, err
}

node, err := buildNodeFromTemplate(scaleSet.Name, template)
node, err := buildNodeFromTemplate(scaleSet.Name, template, scaleSet.manager.azClient.skuClient,
scaleSet.enableDynamicInstanceList)
if err != nil {
return nil, err
}
Loading

0 comments on commit 5dbff8d

Please sign in to comment.