Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[AKS] az aks nodepool add/update/upgrade: Add new parameter --drain-timout to slow down the upgrade #27475

Merged
merged 11 commits into from
Oct 10, 2023
9 changes: 9 additions & 0 deletions src/azure-cli/azure/cli/command_modules/acs/_help.py
Original file line number Diff line number Diff line change
Expand Up @@ -1480,6 +1480,9 @@
- name: --max-surge
type: string
short-summary: Extra nodes used to speed upgrade. When specified, it represents the number or percent used, eg. 5 or 33%
- name: --drain-timeout
type: string
paulgmiller marked this conversation as resolved.
Show resolved Hide resolved
short-summary: When nodes are drain how many minutes to wait for all pods to be evicted
- name: --enable-encryption-at-host
type: bool
short-summary: Enable EncryptionAtHost, default value is false.
Expand Down Expand Up @@ -1586,6 +1589,9 @@
- name: --max-surge
type: string
short-summary: Extra nodes used to speed upgrade. When specified, it represents the number or percent used, eg. 5 or 33%
- name: --drain-timeout
type: string
short-summary: When nodes are drain how many minutes to wait for all pods to be evicted
- name: --node-taints
type: string
short-summary: The node taints for the node pool. You can update the existing node taint of a nodepool or create a new node taint for a nodepool. Pass the empty string `""` to remove all taints.
Expand Down Expand Up @@ -1619,6 +1625,9 @@
- name: --max-surge
type: string
short-summary: Extra nodes used to speed upgrade. When specified, it represents the number or percent used, eg. 5 or 33% (mutually exclusive with "--node-image-only". See "az aks nodepool update --max-surge" to update max surge before upgrading with "--node-image-only")
- name: --drain-timeout
type: string
short-summary: When nodes are drain how long to wait for all pods to be evicted
- name: --snapshot-id
type: string
short-summary: The source snapshot id used to upgrade this nodepool.
Expand Down
3 changes: 3 additions & 0 deletions src/azure-cli/azure/cli/command_modules/acs/_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -551,6 +551,7 @@ def load_arguments(self, _):
c.argument('node_osdisk_type', arg_type=get_enum_type(node_os_disk_types))
c.argument('node_osdisk_size', type=int)
c.argument('max_surge', validator=validate_max_surge)
c.argument('drain_timeout', type=int) #not bothering to validate posiive since RP will handle that though zero will just be omitted.
paulgmiller marked this conversation as resolved.
Show resolved Hide resolved
c.argument('mode', get_enum_type(node_mode_types))
c.argument('scale_down_mode', arg_type=get_enum_type(scale_down_modes))
c.argument('max_pods', type=int, options_list=['--max-pods', '-m'])
Expand All @@ -577,11 +578,13 @@ def load_arguments(self, _):
c.argument('tags', tags_type)
c.argument('node_taints', validator=validate_nodepool_taints)
c.argument('max_surge', validator=validate_max_surge)
c.argument('drain_timeout', type=int) #not bothering to validate posiive since RP will handle that though zero will just be omitted.
c.argument('mode', get_enum_type(node_mode_types))
c.argument('scale_down_mode', arg_type=get_enum_type(scale_down_modes))

with self.argument_context('aks nodepool upgrade') as c:
c.argument('max_surge', validator=validate_max_surge)
c.argument('drain_timeout', type=int) #not bothering to validate posiive since RP will handle that though zero will just be omitted.
c.argument('snapshot_id', validator=validate_snapshot_id)
c.argument('yes', options_list=['--yes', '-y'], help='Do not prompt for confirmation.', action='store_true')

Expand Down
2 changes: 1 addition & 1 deletion src/azure-cli/azure/cli/command_modules/acs/_validators.py
Original file line number Diff line number Diff line change
Expand Up @@ -464,7 +464,7 @@ def validate_max_surge(namespace):
raise CLIError("--max-surge must be positive")
except ValueError:
raise CLIError("--max-surge should be an int or percentage")

paulgmiller marked this conversation as resolved.
Show resolved Hide resolved

def validate_assign_identity(namespace):
if namespace.assign_identity is not None:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -930,6 +930,26 @@ def get_max_surge(self):
# this parameter does not need dynamic completion
# this parameter does not need validation
return max_surge

def get_drain_timeout(self):
"""Obtain the value of drain_timeout.

:return: int
"""
# read the original value passed by the command
drain_timeout = self.raw_param.get("drain_timeout")
# In create mode, try to read the property value corresponding to the parameter from the `agentpool` object
if self.decorator_mode == DecoratorMode.CREATE:
if (
self.agentpool and
self.agentpool.upgrade_settings and
self.agentpool.upgrade_settings.drain_timeout is not None
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@paulgmiller @FumingZhang ^^^ shouldn't that be

self.agentpool.upgrade_settings.drain_timeout_in_minutes is not None

?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sigh I miss static typing.

):
drain_timeout = self.agentpool.upgrade_settings.drain_timeout_in_minutes

# this parameter does not need dynamic completion
# this parameter does not need validation
return drain_timeout

def get_vm_set_type(self) -> str:
"""Obtain the value of vm_set_type, default value is CONST_VIRTUAL_MACHINE_SCALE_SETS.
Expand Down Expand Up @@ -1468,6 +1488,11 @@ def set_up_upgrade_settings(self, agentpool: AgentPool) -> AgentPool:
max_surge = self.context.get_max_surge()
if max_surge:
upgrade_settings.max_surge = max_surge

drain_timeout = self.context.get_drain_timeout()
if drain_timeout:
upgrade_settings.drain_timeout_in_minutes = drain_timeout

agentpool.upgrade_settings = upgrade_settings
return agentpool

Expand Down Expand Up @@ -1726,7 +1751,13 @@ def update_upgrade_settings(self, agentpool: AgentPool) -> AgentPool:
max_surge = self.context.get_max_surge()
if max_surge:
upgrade_settings.max_surge = max_surge
agentpool.upgrade_settings = upgrade_settings #why not always set this? so we don't wipe out a preview feaure in upgrade settigns like NodeSoakDuration?

drain_timeout = self.context.get_drain_timeout()
if drain_timeout:
upgrade_settings.drain_timeout_in_minutes = drain_timeout
agentpool.upgrade_settings = upgrade_settings

return agentpool

def update_vm_properties(self, agentpool: AgentPool) -> AgentPool:
Expand Down
14 changes: 10 additions & 4 deletions src/azure-cli/azure/cli/command_modules/acs/custom.py
Original file line number Diff line number Diff line change
Expand Up @@ -2177,6 +2177,7 @@ def aks_agentpool_add(
node_osdisk_type=None,
node_osdisk_size=None,
max_surge=None,
drain_timeout=None,
mode=CONST_NODEPOOL_MODE_USER,
scale_down_mode=CONST_SCALE_DOWN_MODE_DELETE,
max_pods=None,
Expand Down Expand Up @@ -2230,6 +2231,7 @@ def aks_agentpool_update(
tags=None,
node_taints=None,
max_surge=None,
drain_timeout=None,
mode=None,
scale_down_mode=None,
no_wait=False,
Expand Down Expand Up @@ -2267,6 +2269,7 @@ def aks_agentpool_upgrade(cmd, client, resource_group_name, cluster_name,
kubernetes_version='',
node_image_only=False,
max_surge=None,
drain_timeout=None,
snapshot_id=None,
no_wait=False,
aks_custom_headers=None,
Expand All @@ -2284,11 +2287,11 @@ def aks_agentpool_upgrade(cmd, client, resource_group_name, cluster_name,
)

# Note: we exclude this option because node image upgrade can't accept nodepool put fields like max surge
if max_surge and node_image_only:
if (max_surge or drain_timeout) and node_image_only:
raise MutuallyExclusiveArgumentError(
'Conflicting flags. Unable to specify max-surge with node-image-only.'
'If you want to use max-surge with a node image upgrade, please first '
'update max-surge using "az aks nodepool update --max-surge".'
'Conflicting flags. Unable to specify max-surge/drain_timeout with node-image-only.'
'If you want to use max-surge/drain_timeout with a node image upgrade, please first '
'update max-surge/drain_timeout using "az aks nodepool update --max-surge".'
paulgmiller marked this conversation as resolved.
Show resolved Hide resolved
)

if node_image_only:
Expand Down Expand Up @@ -2335,6 +2338,9 @@ def aks_agentpool_upgrade(cmd, client, resource_group_name, cluster_name,

if max_surge:
instance.upgrade_settings.max_surge = max_surge
if drain_timeout:
instance.upgrade_settings.drain_timeout_in_minutes = drain_timeout


# custom headers
aks_custom_headers = extract_comma_separated_string(
Expand Down