
Merged PR 3441: CAS 1.29 cherry-picks
**What type of PR is this?** (bug, cleanup, documentation, feature)
1.29 release

**What this PR does / why we need it?**:
AKS custom cherry-picks for 1.29

**Does this PR introduce a user-facing change?**

**WorkItem**
<!--- NOTE: REPLACE THIS LINE WITH THE WORKITEM SUFFIXED WITH "https://dev.azure.com/msazure/CloudNativeCompute".
Currently, this CAS repo is under "https://dev.azure.com/AzureContainerUpstream", work items are under "https://dev.azure.com/msazure/CloudNativeCompute", and ADO doesn't support cross-linking.---->

**Testing**
- [x] Unit tests
- [ ] E2E tests (we can test custom branches on standalone env only)
- [ ] Other tests (e.g. manual testing).

<details>
<summary>Cherry-pick details</summary>

**Is this a cherry-picked PR?**
- [ ] Yes. Include original PR.
```
```
- [x] No. On which internal versions/branches is this PR getting cherry-picked?
```
new release 1.29
```

**Is this PR getting merged on [upstream](https://github.com/kubernetes/autoscaler)?**
- [ ] Yes. Which branches/versions, including master?
```
```
- [x] No. Why?
```
new release 1.29
```
</details>

<br>

**Special notes for your reviewer**:

**Additional Documentation**:
gandhipr committed Jan 25, 2024
2 parents 33fb788 + 99d655f commit bfb24ba
Showing 1,446 changed files with 309,888 additions and 11,092 deletions.
40 changes: 40 additions & 0 deletions .azuredevops/pull_request_template.md
@@ -0,0 +1,40 @@
**What type of PR is this?** (bug, cleanup, documentation, feature)

**What this PR does / why we need it?**:

**Does this PR introduce a user-facing change?**

**WorkItem**
<!--- NOTE: REPLACE THIS LINE WITH THE WORKITEM SUFFIXED WITH "https://dev.azure.com/msazure/CloudNativeCompute".
Currently, this CAS repo is under "https://dev.azure.com/AzureContainerUpstream", work items are under "https://dev.azure.com/msazure/CloudNativeCompute", and ADO doesn't support cross-linking.---->

**Testing**
- [ ] Unit tests
- [ ] E2E tests (we can test custom branches on standalone env only)
- [ ] Other tests (e.g. manual testing).

<details>
<summary>Cherry-pick details</summary>

**Is this a cherry-picked PR?**
- [ ] Yes. Include original PR.
```
```
- [ ] No. On which internal versions/branches is this PR getting cherry-picked?
```
```

**Is this PR getting merged on [upstream](https://github.com/kubernetes/autoscaler)?**
- [ ] Yes. Which branches/versions, including master?
```
```
- [ ] No. Why?
```
```
</details>

<br>

**Special notes for your reviewer**:

**Additional Documentation**:
48 changes: 48 additions & 0 deletions .pipelines/cluster-autoscaler.yaml
@@ -0,0 +1,48 @@
pr:
- master
- cluster-autoscaler-release-*

variables:
- name: GOPATH
  value: $(Agent.BuildDirectory)/go
- name: GOBIN
  value: $(GOPATH)/bin
- name: GO111MODULE
  value: auto
- name: autoscaler.clone.dir
  value: go/src/k8s.io/autoscaler
- name: DisableDockerDetector
  value: true

jobs:
- job: cluster_autoscaler_test
  pool: staging-pool-amd64-mariner-2
  displayName: Autoscaler Unit Tests
  workspace:
    clean: all
  steps:
  - checkout: self
    path: $(autoscaler.clone.dir)
  - script: |
      echo '##vso[task.prependpath]$(GOPATH)/bin'
      hack/install-verify-tools.sh
      go get github.com/t-yuki/gocover-cobertura
    displayName: "Prepare"
  - task: UseDotNet@2
    inputs:
      version: 7.0.x
    displayName: 'Prepare .NET Core SDK'
  - script: |
      hack/verify-all.sh -v
    displayName: "Verify"
  - script: |
      hack/for-go-proj.sh coverage
    displayName: "Test"
  - script: |
      gocover-cobertura < '$(System.DefaultWorkingDirectory)/cluster-autoscaler/coverage.txt' > '$(System.DefaultWorkingDirectory)/cluster-autoscaler/coverage.xml'
    displayName: 'Convert'
  - task: PublishCodeCoverageResults@1
    inputs:
      codeCoverageTool: 'Cobertura'
      summaryFileLocation: '$(System.DefaultWorkingDirectory)/cluster-autoscaler/coverage.xml'
    displayName: 'Publish Coverage'
4 changes: 2 additions & 2 deletions cluster-autoscaler/cloudprovider/azure/azure_agent_pool.go
@@ -163,7 +163,7 @@ func (as *AgentPool) GetVMIndexes() ([]int, map[int]string, error) {
		}

		indexes = append(indexes, index)
-		resourceID, err := convertResourceGroupNameToLower("azure://" + *instance.ID)
+		resourceID, err := convertResourceGroupNameToLower(azurePrefix + *instance.ID)
		if err != nil {
			return nil, nil, err
		}
@@ -484,7 +484,7 @@ func (as *AgentPool) Nodes() ([]cloudprovider.Instance, error) {

		// To keep consistent with providerID from kubernetes cloud provider, convert
		// resourceGroupName in the ID to lower case.
-		resourceID, err := convertResourceGroupNameToLower("azure://" + *instance.ID)
+		resourceID, err := convertResourceGroupNameToLower(azurePrefix + *instance.ID)
		if err != nil {
			return nil, err
		}
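Both hunks make the same mechanical change: the "azure://" literal is replaced by a shared constant so every providerID is built the same way. A minimal sketch of the idea; the constant's value is an assumption inferred from the literal it replaces, and buildProviderID is a hypothetical helper, not a function from the repo:

```go
package main

import "fmt"

// azurePrefix is assumed to be declared once in the azure package; its value
// here is inferred from the "azure://" literal the diff removes.
const azurePrefix = "azure://"

// buildProviderID is a hypothetical helper showing the call sites' shared
// pattern: prefix an ARM resource ID to obtain a Kubernetes providerID.
func buildProviderID(armResourceID string) string {
	return azurePrefix + armResourceID
}

func main() {
	fmt.Println(buildProviderID("/subscriptions/sub/resourceGroups/rg/providers/Microsoft.Compute/virtualMachines/vm-0"))
}
```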
27 changes: 18 additions & 9 deletions cluster-autoscaler/cloudprovider/azure/azure_agent_pool_test.go
@@ -35,8 +35,8 @@ import (
	"github.com/Azure/azure-sdk-for-go/services/storage/mgmt/2021-09-01/storage"
	"github.com/Azure/go-autorest/autorest/date"
	"github.com/Azure/go-autorest/autorest/to"
-	"github.com/golang/mock/gomock"
	"github.com/stretchr/testify/assert"
+	"go.uber.org/mock/gomock"
)

var (
@@ -185,7 +185,8 @@ func TestGetVMsFromCache(t *testing.T) {
	mockVMClient := mockvmclient.NewMockInterface(ctrl)
	testAS.manager.azClient.virtualMachinesClient = mockVMClient
	mockVMClient.EXPECT().List(gomock.Any(), testAS.manager.config.ResourceGroup).Return(expectedVMs, nil)
-	ac, err := newAzureCache(testAS.manager.azClient, refreshInterval, testAS.manager.config.ResourceGroup, vmTypeStandard, false, "")
+	testAS.manager.config.VMType = vmTypeStandard
+	ac, err := newAzureCache(testAS.manager.azClient, refreshInterval, *testAS.manager.config)
	assert.NoError(t, err)
	testAS.manager.azureCache = ac

@@ -203,7 +204,8 @@ func TestGetVMIndexes(t *testing.T) {
	mockVMClient := mockvmclient.NewMockInterface(ctrl)
	as.manager.azClient.virtualMachinesClient = mockVMClient
	mockVMClient.EXPECT().List(gomock.Any(), as.manager.config.ResourceGroup).Return(expectedVMs, nil)
-	ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard, false, "")
+	as.manager.config.VMType = vmTypeStandard
+	ac, err := newAzureCache(as.manager.azClient, refreshInterval, *as.manager.config)
	assert.NoError(t, err)
	as.manager.azureCache = ac

@@ -242,7 +244,8 @@ func TestGetCurSize(t *testing.T) {
	mockVMClient := mockvmclient.NewMockInterface(ctrl)
	as.manager.azClient.virtualMachinesClient = mockVMClient
	mockVMClient.EXPECT().List(gomock.Any(), as.manager.config.ResourceGroup).Return(expectedVMs, nil)
-	ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard, false, "")
+	as.manager.config.VMType = vmTypeStandard
+	ac, err := newAzureCache(as.manager.azClient, refreshInterval, *as.manager.config)
	assert.NoError(t, err)
	as.manager.azureCache = ac

@@ -266,7 +269,8 @@ func TestAgentPoolTargetSize(t *testing.T) {
	as.manager.azClient.virtualMachinesClient = mockVMClient
	expectedVMs := getExpectedVMs()
	mockVMClient.EXPECT().List(gomock.Any(), as.manager.config.ResourceGroup).Return(expectedVMs, nil)
-	ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard, false, "")
+	as.manager.config.VMType = vmTypeStandard
+	ac, err := newAzureCache(as.manager.azClient, refreshInterval, *as.manager.config)
	assert.NoError(t, err)
	as.manager.azureCache = ac

@@ -285,7 +289,8 @@ func TestAgentPoolIncreaseSize(t *testing.T) {
	as.manager.azClient.virtualMachinesClient = mockVMClient
	expectedVMs := getExpectedVMs()
	mockVMClient.EXPECT().List(gomock.Any(), as.manager.config.ResourceGroup).Return(expectedVMs, nil).MaxTimes(2)
-	ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard, false, "")
+	as.manager.config.VMType = vmTypeStandard
+	ac, err := newAzureCache(as.manager.azClient, refreshInterval, *as.manager.config)
	assert.NoError(t, err)
	as.manager.azureCache = ac

@@ -313,7 +318,8 @@ func TestDecreaseTargetSize(t *testing.T) {
	as.manager.azClient.virtualMachinesClient = mockVMClient
	expectedVMs := getExpectedVMs()
	mockVMClient.EXPECT().List(gomock.Any(), as.manager.config.ResourceGroup).Return(expectedVMs, nil).MaxTimes(3)
-	ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard, false, "")
+	as.manager.config.VMType = vmTypeStandard
+	ac, err := newAzureCache(as.manager.azClient, refreshInterval, *as.manager.config)
	assert.NoError(t, err)
	as.manager.azureCache = ac

@@ -431,7 +437,9 @@ func TestAgentPoolDeleteNodes(t *testing.T) {
	mockSAClient := mockstorageaccountclient.NewMockInterface(ctrl)
	as.manager.azClient.storageAccountsClient = mockSAClient
	mockVMClient.EXPECT().List(gomock.Any(), as.manager.config.ResourceGroup).Return(expectedVMs, nil)
-	ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard, false, "")
+	as.manager.config.VMType = vmTypeStandard
+	ac, err := newAzureCache(as.manager.azClient, refreshInterval, *as.manager.config)
+	as.manager.config.VMType = vmTypeVMSS
	assert.NoError(t, err)
	as.manager.azureCache = ac

@@ -497,7 +505,8 @@ func TestAgentPoolNodes(t *testing.T) {
	mockVMClient := mockvmclient.NewMockInterface(ctrl)
	as.manager.azClient.virtualMachinesClient = mockVMClient
	mockVMClient.EXPECT().List(gomock.Any(), as.manager.config.ResourceGroup).Return(expectedVMs, nil)
-	ac, err := newAzureCache(as.manager.azClient, refreshInterval, as.manager.config.ResourceGroup, vmTypeStandard, false, "")
+	as.manager.config.VMType = vmTypeStandard
+	ac, err := newAzureCache(as.manager.azClient, refreshInterval, *as.manager.config)
	assert.NoError(t, err)
	as.manager.azureCache = ac

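All of these test changes follow one refactor: newAzureCache no longer takes resource group, VM type, and dynamic-instance-list settings as positional arguments but receives the whole Config. A runnable toy sketch of the pattern; the Config fields are the ones visible in the diff, everything else is scaffolding rather than the repo's code:

```go
package main

import (
	"fmt"
	"time"
)

// Config mirrors only the two fields of the autoscaler's Azure Config that
// these tests touch; the real struct has many more.
type Config struct {
	ResourceGroup string
	VMType        string
}

// newAzureCache is a stand-in with the new shape from the diff: one Config
// value replaces a long positional parameter list.
func newAzureCache(cacheTTL time.Duration, config Config) string {
	return fmt.Sprintf("cache{rg:%s vmType:%s ttl:%s}", config.ResourceGroup, config.VMType, cacheTTL)
}

func main() {
	cfg := Config{ResourceGroup: "rg-test"}
	cfg.VMType = "standard" // tests set VMType on the config before building the cache
	fmt.Println(newAzureCache(time.Minute, cfg))
}
```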
74 changes: 55 additions & 19 deletions cluster-autoscaler/cloudprovider/azure/azure_cache.go
@@ -39,34 +39,70 @@ var (
// azureCache is used for caching cluster resources state.
//
// It is needed to:
-//   - keep track of node groups (VM and VMSS types) in the cluster,
-//   - keep track of instances and which node group they belong to,
-//   - limit repetitive Azure API calls.
+//   - keep track of node groups (AKS, VM and VMSS types) in the cluster,
+//   - keep track of instances and which node group they belong to
+//     (for VMSS it only keeps track of the instance-ID-to-node-group mapping),
+//   - limit repetitive Azure API calls.
+//
+// It backs efficient responses to
+//   - cloudprovider.NodeGroups() (= registeredNodeGroups)
+//   - cloudprovider.NodeGroupForNode (via azureManager.GetNodeGroupForInstance => FindForInstance,
+//     using instanceToNodeGroup and unownedInstances)
+//
+// CloudProvider.Refresh, called before every autoscaler loop (every 10s by default),
+// is implemented by AzureManager.Refresh, which makes the cache refresh decision
+// based on AzureManager.lastRefresh and azureCache.refreshInterval.
type azureCache struct {
-	mutex           sync.Mutex
-	interrupt       chan struct{}
-	azClient        *azClient
+	mutex     sync.Mutex
+	interrupt chan struct{}
+	azClient  *azClient

+	// refreshInterval specifies how often azureCache needs to be refreshed.
+	// The value comes from the AZURE_VMSS_CACHE_TTL env var (or 1 min if not specified),
+	// and is also used by some other caches. Together with AzureManager.lastRefresh,
+	// it is used to decide whether a refresh is needed.
	refreshInterval time.Duration

	// Cache content.
-	resourceGroup        string
-	vmType               string
-	scaleSets            map[string]compute.VirtualMachineScaleSet
-	virtualMachines      map[string][]compute.VirtualMachine
+
+	// resourceGroup specifies the name of the resource group that this cache tracks.
+	resourceGroup string
+
+	// vmType can be one of vmTypeVMSS (default), vmTypeStandard, or vmTypeAKS.
+	vmType string
+
+	// scaleSets keeps the set of all known scale sets in the resource group, populated/refreshed via the VMSS List() call.
+	// It is only used/populated if vmType is vmTypeVMSS (default).
+	scaleSets map[string]compute.VirtualMachineScaleSet
+	// virtualMachines keeps the set of all VMs in the resource group.
+	// It is only used/populated if vmType is vmTypeStandard or vmTypeAKS.
+	virtualMachines map[string][]compute.VirtualMachine
+
+	// registeredNodeGroups represents all known NodeGroups.
	registeredNodeGroups []cloudprovider.NodeGroup
-	instanceToNodeGroup  map[azureRef]cloudprovider.NodeGroup
-	unownedInstances     map[azureRef]bool
-	autoscalingOptions   map[azureRef]map[string]string
-	skus                 map[string]*skewer.Cache
+
+	// instanceToNodeGroup maintains a mapping from instance IDs to node groups.
+	// It is populated from the results of calling Nodes() on each node group.
+	// It is used (together with unownedInstances) when looking up the node group
+	// for a given instance ID (see FindForInstance).
+	instanceToNodeGroup map[azureRef]cloudprovider.NodeGroup
+
+	// unownedInstances maintains a set of instance IDs not belonging to any node group.
+	// It is used (together with instanceToNodeGroup) when looking up the node group for a given instance ID.
+	// It is reset by invalidateUnownedInstanceCache().
+	unownedInstances map[azureRef]bool
+
+	autoscalingOptions map[azureRef]map[string]string
+	skus               map[string]*skewer.Cache
}

-func newAzureCache(client *azClient, cacheTTL time.Duration, resourceGroup, vmType string, enableDynamicInstanceList bool, defaultLocation string) (*azureCache, error) {
+func newAzureCache(client *azClient, cacheTTL time.Duration, config Config) (*azureCache, error) {
	cache := &azureCache{
		interrupt:            make(chan struct{}),
		azClient:             client,
		refreshInterval:      cacheTTL,
-		resourceGroup:        resourceGroup,
-		vmType:               vmType,
+		resourceGroup:        config.ResourceGroup,
+		vmType:               config.VMType,
		scaleSets:            make(map[string]compute.VirtualMachineScaleSet),
		virtualMachines:      make(map[string][]compute.VirtualMachine),
		registeredNodeGroups: make([]cloudprovider.NodeGroup, 0),
@@ -76,8 +112,8 @@ func newAzureCache(client *azClient, cacheTTL time.Duration, resourceGroup, vmTy
		skus:                 make(map[string]*skewer.Cache),
	}

-	if enableDynamicInstanceList {
-		cache.skus[defaultLocation] = &skewer.Cache{}
+	if config.EnableDynamicInstanceList {
+		cache.skus[config.Location] = &skewer.Cache{}
	}

if err := cache.regenerate(); err != nil {
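The new comments describe a two-map lookup: instanceToNodeGroup as the positive cache and unownedInstances as the negative cache. A self-contained toy of that lookup order; the types and logic here are assumptions based on the comments above, not a copy of the repo's FindForInstance:

```go
package main

import "fmt"

type azureRef struct{ Name string }

// nodeGroup is a stand-in for cloudprovider.NodeGroup.
type nodeGroup struct{ id string }

// cache models only the two lookup maps the doc comments describe.
type cache struct {
	instanceToNodeGroup map[azureRef]*nodeGroup
	unownedInstances    map[azureRef]bool
}

// findForInstance sketches the assumed lookup order: positive map first, then
// the negative (unowned) cache; a miss on both would trigger a cache refresh
// in the real code, which is elided here.
func (c *cache) findForInstance(ref azureRef) (*nodeGroup, bool) {
	if ng, ok := c.instanceToNodeGroup[ref]; ok {
		return ng, true
	}
	if c.unownedInstances[ref] {
		return nil, false // known not to belong to any node group
	}
	// The real code would regenerate the cache here before giving up.
	return nil, false
}

func main() {
	c := &cache{
		instanceToNodeGroup: map[azureRef]*nodeGroup{{Name: "vm-0"}: {id: "pool-a"}},
		unownedInstances:    map[azureRef]bool{{Name: "vm-master"}: true},
	}
	ng, ok := c.findForInstance(azureRef{Name: "vm-0"})
	fmt.Println(ng, ok) // &{pool-a} true
}
```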
1 change: 1 addition & 0 deletions cluster-autoscaler/cloudprovider/azure/azure_client.go
@@ -276,6 +276,7 @@ func newAzClient(cfg *Config, env *azure.Environment) (*azClient, error) {
	// https://github.com/Azure/go-autorest/blob/main/autorest/azure/environments.go
	skuClient := compute.NewResourceSkusClientWithBaseURI(azClientConfig.ResourceManagerEndpoint, cfg.SubscriptionID)
	skuClient.Authorizer = azClientConfig.Authorizer
+	skuClient.UserAgent = azClientConfig.UserAgent
	klog.V(5).Infof("Created sku client with authorizer: %v", skuClient)

	return &azClient{
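The one-line addition stamps the SKU client's user agent the way the other clients already do; on autorest-based clients, setting UserAgent attributes the client's ARM traffic in telemetry and throttling logs instead of the generic SDK agent string. A minimal sketch, where the compute API version in the import path and the agent value are illustrative assumptions:

```go
package main

import (
	"fmt"

	"github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2022-08-01/compute"
)

func main() {
	// Placeholders: real code takes the endpoint and subscription from config.
	skuClient := compute.NewResourceSkusClientWithBaseURI("https://management.azure.com/", "<subscription-id>")
	// UserAgent is a field on the embedded autorest.Client; stamping it tags
	// every request this client sends.
	skuClient.UserAgent = "cluster-autoscaler/v1.29" // illustrative value
	fmt.Println(skuClient.UserAgent)
}
```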
cluster-autoscaler/cloudprovider/azure/azure_cloud_provider.go
@@ -31,7 +31,8 @@ import (

const (
	// GPULabel is the label added to nodes with GPU resource.
-	GPULabel = "accelerator"
+	GPULabel       = AKSLabelKeyPrefixValue + "accelerator"
+	legacyGPULabel = "accelerator"
)

var (
@@ -176,7 +177,7 @@ func BuildAzure(opts config.AutoscalingOptions, do cloudprovider.NodeGroupDiscov
	} else {
		klog.Info("Creating Azure Manager with default configuration.")
	}
-	manager, err := CreateAzureManager(config, do)
+	manager, err := CreateAzureManager(config, do, opts)
	if err != nil {
		klog.Fatalf("Failed to create Azure Manager: %v", err)
	}
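GPULabel now carries the AKS label prefix, while the old unprefixed value is kept as legacyGPULabel. A toy sketch of why a caller might consult both during the transition; the prefix value "kubernetes.azure.com/" is an assumption, since the diff does not show AKSLabelKeyPrefixValue's definition:

```go
package main

import "fmt"

const (
	// aksLabelKeyPrefixValue is an assumed stand-in for AKSLabelKeyPrefixValue.
	aksLabelKeyPrefixValue = "kubernetes.azure.com/"
	gpuLabel               = aksLabelKeyPrefixValue + "accelerator"
	legacyGPULabel         = "accelerator"
)

// hasGPULabel checks the new prefixed label first, then falls back to the
// legacy unprefixed one, so nodes labeled before the change still count.
func hasGPULabel(nodeLabels map[string]string) bool {
	if _, ok := nodeLabels[gpuLabel]; ok {
		return true
	}
	_, ok := nodeLabels[legacyGPULabel]
	return ok
}

func main() {
	fmt.Println(hasGPULabel(map[string]string{"accelerator": "nvidia"})) // true via legacy label
}
```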
