diff --git a/community/examples/AMD/hpc-amd-slurm.yaml b/community/examples/AMD/hpc-amd-slurm.yaml
index 211f41e9cc..0d635475e9 100644
--- a/community/examples/AMD/hpc-amd-slurm.yaml
+++ b/community/examples/AMD/hpc-amd-slurm.yaml
@@ -178,7 +178,6 @@ deployment_groups:
       machine_type: c2d-standard-4
       node_count_dynamic_max: 10
       bandwidth_tier: gvnic_enabled
-      enable_placement: false
       allow_automatic_updates: false
 
   - id: low_cost_partition
@@ -194,7 +193,6 @@ deployment_groups:
       machine_type: c2d-standard-112
       node_count_dynamic_max: 50
       bandwidth_tier: gvnic_enabled
-      enable_placement: true
       allow_automatic_updates: false
 
   # Because is_default is set to true, jobs will run on this partition unless an
diff --git a/community/examples/hpc-slurm-sharedvpc.yaml b/community/examples/hpc-slurm-sharedvpc.yaml
index 827824e432..c2302b17c9 100644
--- a/community/examples/hpc-slurm-sharedvpc.yaml
+++ b/community/examples/hpc-slurm-sharedvpc.yaml
@@ -61,7 +61,6 @@ deployment_groups:
     settings:
       node_count_dynamic_max: 4
       machine_type: n2-standard-2
-      enable_placement: false # the default is: true
       allow_automatic_updates: false
 
   - id: debug_partition
diff --git a/community/examples/hpc-slurm-ubuntu2004.yaml b/community/examples/hpc-slurm-ubuntu2004.yaml
index d4d0a5dae4..20bc81d506 100644
--- a/community/examples/hpc-slurm-ubuntu2004.yaml
+++ b/community/examples/hpc-slurm-ubuntu2004.yaml
@@ -57,7 +57,6 @@ deployment_groups:
     use: [network]
     settings:
       instance_image: $(vars.slurm_image)
-      enable_placement: false # the default is: true
       node_count_dynamic_max: 4
       machine_type: n2-standard-2
 
diff --git a/community/examples/htc-slurm.yaml b/community/examples/htc-slurm.yaml
index 4fef4d3aec..03e4093c20 100644
--- a/community/examples/htc-slurm.yaml
+++ b/community/examples/htc-slurm.yaml
@@ -88,7 +88,6 @@ deployment_groups:
       name: c2s60
       node_count_dynamic_max: 200
       bandwidth_tier: gvnic_enabled
-      enable_placement: false
       allow_automatic_updates: false
 
   - id: compute_nodeset_c2s30
@@ -98,7 +97,6 @@ deployment_groups:
       node_count_dynamic_max: 200
       machine_type: c2-standard-30
       bandwidth_tier: gvnic_enabled
-      enable_placement: false
       allow_automatic_updates: false
 
   - id: compute_partition
@@ -122,7 +120,6 @@ deployment_groups:
       machine_type: n2-standard-2
       node_count_dynamic_max: 10
       bandwidth_tier: gvnic_enabled
-      enable_placement: false
       allow_automatic_updates: false
 
   - id: low_cost_nodeset_n2s4
@@ -133,7 +130,6 @@ deployment_groups:
       machine_type: n2-standard-4
       node_count_dynamic_max: 10
       bandwidth_tier: gvnic_enabled
-      enable_placement: false
       allow_automatic_updates: false
 
   - id: low_cost_partition
diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md
index 984097b2e0..8a875bcd04 100644
--- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md
+++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/README.md
@@ -177,7 +177,7 @@ modules. For support with the underlying modules, see the instructions in the
 | [enable\_maintenance\_reservation](#input\_enable\_maintenance\_reservation) | Enables slurm reservation for scheduled maintenance. | `bool` | `false` | no |
 | [enable\_opportunistic\_maintenance](#input\_enable\_opportunistic\_maintenance) | On receiving maintenance notification, maintenance will be performed as soon as nodes becomes idle. | `bool` | `false` | no |
 | [enable\_oslogin](#input\_enable\_oslogin) | Enables Google Cloud os-login for user login and authentication for VMs.<br/>See https://cloud.google.com/compute/docs/oslogin | `bool` | `true` | no |
-| [enable\_placement](#input\_enable\_placement) | Enable placement groups. | `bool` | `true` | no |
+| [enable\_placement](#input\_enable\_placement) | Use placement policy for VMs in this nodeset.<br/>See: https://cloud.google.com/compute/docs/instances/placement-policies-overview<br/>To set max\_distance of used policy, use `placement_max_distance` variable.<br/><br/>Enabled by default, reasons for users to disable it:<br/>- If non-dense reservation is used, user can avoid extra-cost of creating placement policies;<br/>- If user wants to avoid "all or nothing" VM provisioning behaviour;<br/>- If user wants to intentionally have "spread" VMs (e.g. for reliability reasons) | `bool` | `true` | no |
 | [enable\_public\_ips](#input\_enable\_public\_ips) | If set to true. The node group VMs will have a random public IP assigned to it. Ignored if access\_config is set. | `bool` | `false` | no |
 | [enable\_shielded\_vm](#input\_enable\_shielded\_vm) | Enable the Shielded VM configuration. Note: the instance image must support option. | `bool` | `false` | no |
 | [enable\_smt](#input\_enable\_smt) | DEPRECATED: Use `advanced_machine_features.threads_per_core` instead. | `bool` | `null` | no |
diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/outputs.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/outputs.tf
index d618644d52..d9cdbb43e6 100644
--- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/outputs.tf
+++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/outputs.tf
@@ -25,14 +25,6 @@ output "nodeset" {
     error_message = "A disk_type=${var.disk_type} cannot be used with machine_type=${var.machine_type}."
   }
 
-  precondition {
-    condition     = var.reservation_name == "" || !var.enable_placement
-    error_message = <<-EOD
-      If a reservation is specified, `var.enable_placement` must be `false`.
-      If the specified reservation has a placement policy then it will be used automatically.
-      EOD
-  }
-
   precondition {
     condition     = var.reservation_name == "" || length(var.zones) == 0
     error_message = <<-EOD
diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf
index 41e7e6dbd6..ee44853205 100644
--- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf
+++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset/variables.tf
@@ -375,7 +375,16 @@ variable "disable_public_ips" {
 
 # tflint-ignore: terraform_unused_declarations
 variable "enable_placement" {
-  description = "Enable placement groups."
+  description = <<-EOD
+    Use placement policy for VMs in this nodeset.
+    See: https://cloud.google.com/compute/docs/instances/placement-policies-overview
+    To set max_distance of used policy, use `placement_max_distance` variable.
+
+    Enabled by default, reasons for users to disable it:
+    - If non-dense reservation is used, user can avoid extra-cost of creating placement policies;
+    - If user wants to avoid "all or nothing" VM provisioning behaviour;
+    - If user wants to intentionally have "spread" VMs (e.g. for reliability reasons)
+  EOD
   type        = bool
   default     = true
 }
diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-partition/main.tf b/community/modules/compute/schedmd-slurm-gcp-v6-partition/main.tf
index 4d2e0eead4..64386d2509 100644
--- a/community/modules/compute/schedmd-slurm-gcp-v6-partition/main.tf
+++ b/community/modules/compute/schedmd-slurm-gcp-v6-partition/main.tf
@@ -13,9 +13,8 @@
 # limitations under the License.
 
 locals {
-  non_static_ns_with_placement = [for ns in var.nodeset : ns.nodeset_name if ns.enable_placement && ns.node_count_static == 0]
-  use_static                   = [for ns in concat(var.nodeset, var.nodeset_tpu) : ns.nodeset_name if ns.node_count_static > 0]
-  uses_job_duration            = length([for ns in var.nodeset : ns.dws_flex.use_job_duration if ns.dws_flex.use_job_duration]) > 0 ? true : false
+  use_static        = [for ns in concat(var.nodeset, var.nodeset_tpu) : ns.nodeset_name if ns.node_count_static > 0]
+  uses_job_duration = length([for ns in var.nodeset : ns.dws_flex.use_job_duration if ns.dws_flex.use_job_duration]) > 0 ? true : false
 
   has_node = length(var.nodeset) > 0
   has_dyn  = length(var.nodeset_dyn) > 0
diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-partition/outputs.tf b/community/modules/compute/schedmd-slurm-gcp-v6-partition/outputs.tf
index 4a06593b32..ff796ad0f6 100644
--- a/community/modules/compute/schedmd-slurm-gcp-v6-partition/outputs.tf
+++ b/community/modules/compute/schedmd-slurm-gcp-v6-partition/outputs.tf
@@ -17,11 +17,6 @@ output "partitions" {
 
   value = [local.partition]
 
-  precondition {
-    condition     = (length(local.non_static_ns_with_placement) == 0) || var.exclusive
-    error_message = "If any non-static nodesets has `enable_placement`, `var.exclusive` must be set true"
-  }
-
   precondition {
     condition     = (length(local.use_static) == 0) || !var.exclusive
     error_message = <<-EOD
diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py
index d68f7e8b41..110f5ee05f 100755
--- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py
+++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/modules/slurm_files/scripts/resume.py
@@ -178,7 +178,6 @@ def create_instances_request(nodes: List[str], placement_group: Optional[str], e
     )
 
     if placement_group:
-        assert len(nodes) <= PLACEMENT_MAX_CNT
         pass # do not set minCount to force "all or nothing" behavior
     else:
         body["minCount"] = 1
diff --git a/docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml b/docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml
index 0f96ec1ac5..cfcffdaa8e 100644
--- a/docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml
+++ b/docs/hybrid-slurm-cluster/blueprints/hybrid-configuration.yaml
@@ -75,7 +75,6 @@ deployment_groups:
     settings:
       partition_name: debug
       exclusive: false
-      enable_placement: false
       is_default: false
 
   - id: compute_node_group
diff --git a/docs/tutorials/hpc-slurm-qwiklabs.yaml b/docs/tutorials/hpc-slurm-qwiklabs.yaml
index f4bfc81941..156b8397f0 100644
--- a/docs/tutorials/hpc-slurm-qwiklabs.yaml
+++ b/docs/tutorials/hpc-slurm-qwiklabs.yaml
@@ -39,7 +39,6 @@ deployment_groups:
     settings:
       node_count_dynamic_max: 4
       machine_type: n2d-standard-2
-      enable_placement: false # the default is: true
 
   - id: debug_partition
     source: community/modules/compute/schedmd-slurm-gcp-v6-partition
diff --git a/examples/hcls-blueprint.yaml b/examples/hcls-blueprint.yaml
index cc4897d922..0b5a697fdc 100644
--- a/examples/hcls-blueprint.yaml
+++ b/examples/hcls-blueprint.yaml
@@ -333,7 +333,6 @@ deployment_groups:
         threads_per_core: null # Use platform default value
       node_count_dynamic_max: 20
       machine_type: g2-standard-4
-      enable_placement: False
       allow_automatic_updates: false
 
   - id: gpu_partition
diff --git a/examples/hpc-enterprise-slurm.yaml b/examples/hpc-enterprise-slurm.yaml
index 921cbf1921..9bc6b39b9e 100644
--- a/examples/hpc-enterprise-slurm.yaml
+++ b/examples/hpc-enterprise-slurm.yaml
@@ -118,7 +118,6 @@ deployment_groups:
       node_count_dynamic_max: 4
       machine_type: n2-standard-2
       instance_image: $(vars.slurm_image)
-      enable_placement: false # the default is: true
       allow_automatic_updates: false
 
   - id: n2_partition
@@ -138,7 +137,6 @@ deployment_groups:
       node_count_dynamic_max: 20
       machine_type: c2-standard-60 # this is the default
       instance_image: $(vars.slurm_image)
-      enable_placement: true
       bandwidth_tier: tier_1_enabled
       disk_type: pd-ssd
       disk_size_gb: 100
@@ -152,7 +150,7 @@
     settings:
       partition_name: c2
       # the following two are true by default
-      exclusive: true # this must be true if nodeset.enable_placement is true
+      exclusive: true
 
   - id: c2d_nodeset
     source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
diff --git a/examples/hpc-slurm-static.yaml b/examples/hpc-slurm-static.yaml
index 847e02e713..f72645441c 100644
--- a/examples/hpc-slurm-static.yaml
+++ b/examples/hpc-slurm-static.yaml
@@ -47,7 +47,6 @@ deployment_groups:
     settings:
       node_count_static: $(vars.static_node_count)
       node_count_dynamic_max: 0
-      enable_placement: false # placement is done on reservation
       reservation_name: $(vars.static_reservation_name)
       machine_type: $(vars.static_reservation_machine_type)
       instance_image: $(vars.slurm_instance_image)
diff --git a/examples/hpc-slurm.yaml b/examples/hpc-slurm.yaml
index e381f1a9c0..b9e322d7fc 100644
--- a/examples/hpc-slurm.yaml
+++ b/examples/hpc-slurm.yaml
@@ -55,7 +55,6 @@ deployment_groups:
     settings:
       node_count_dynamic_max: 4
       machine_type: n2-standard-2
-      enable_placement: false # the default is: true
       allow_automatic_updates: false
 
   - id: debug_partition
diff --git a/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-2-cluster.yaml b/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-2-cluster.yaml
index d4bf5b3dd9..f014e9b546 100644
--- a/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-2-cluster.yaml
+++ b/examples/machine-learning/a3-highgpu-8g/ml-slurm-a3-2-cluster.yaml
@@ -96,7 +96,6 @@ deployment_groups:
       node_count_static: 0
       node_count_dynamic_max: 4
       machine_type: n2-standard-2
-      enable_placement: false
 
   - id: debug_partition
     source: community/modules/compute/schedmd-slurm-gcp-v6-partition
@@ -182,7 +181,6 @@ deployment_groups:
     settings:
       reservation_name: $(vars.a3_reservation_name)
       maintenance_interval: $(vars.a3_maintenance_interval)
-      enable_placement: false
       node_count_static: $(vars.a3_static_cluster_size)
       node_count_dynamic_max: 0
       disk_type: pd-ssd
diff --git a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml
index 3b66916e35..39a2492e5c 100644
--- a/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml
+++ b/examples/machine-learning/a3-megagpu-8g/slurm-a3mega-cluster.yaml
@@ -21,7 +21,6 @@ vars:
   deployment_name: a3mega-cluster
   a3mega_partition_name: a3mega
   a3mega_maintenance_interval: ""
-  enable_placement: false
   remote_mount_homefs: /nfsshare
   local_mount_homefs: /home
   instance_image_custom: true
diff --git a/examples/ml-slurm.yaml b/examples/ml-slurm.yaml
index 07df8b8a29..3b7669ecfd 100644
--- a/examples/ml-slurm.yaml
+++ b/examples/ml-slurm.yaml
@@ -208,7 +208,6 @@ deployment_groups:
     use: [network]
     settings:
       node_count_dynamic_max: 20
-      enable_placement: false
       bandwidth_tier: gvnic_enabled
       machine_type: g2-standard-4
       instance_image: $(vars.new_image)
diff --git a/examples/ps-slurm.yaml b/examples/ps-slurm.yaml
index b646356522..313d43966f 100644
--- a/examples/ps-slurm.yaml
+++ b/examples/ps-slurm.yaml
@@ -59,7 +59,6 @@ deployment_groups:
     settings:
       node_count_dynamic_max: 4
       machine_type: $(vars.compute_node_machine_type)
-      enable_placement: false # the default is: true
       allow_automatic_updates: false
 
   - id: debug_partition
diff --git a/tools/cloud-build/daily-tests/blueprints/lustre-slurm.yaml b/tools/cloud-build/daily-tests/blueprints/lustre-slurm.yaml
index d6ddeeadcc..d94774ffc9 100644
--- a/tools/cloud-build/daily-tests/blueprints/lustre-slurm.yaml
+++ b/tools/cloud-build/daily-tests/blueprints/lustre-slurm.yaml
@@ -23,8 +23,6 @@ vars:
   zone: us-central1-a
   machine_type: n2-standard-2
   disk_type: pd-ssd
-  # enable_placement: false
-  # on_host_maintenance: MIGRATE
   num_nodes: 1
   rocky_image:
     family: slurm-gcp-6-8-hpc-rocky-linux-8
diff --git a/tools/python-integration-tests/blueprints/topology-test.yaml b/tools/python-integration-tests/blueprints/topology-test.yaml
index 8e779f57bd..176469c53a 100644
--- a/tools/python-integration-tests/blueprints/topology-test.yaml
+++ b/tools/python-integration-tests/blueprints/topology-test.yaml
@@ -36,7 +36,6 @@ deployment_groups:
       node_count_dynamic_max: 0
       node_count_static: 5
       allow_automatic_updates: false
-      enable_placement: true
 
   - id: partition
     source: community/modules/compute/schedmd-slurm-gcp-v6-partition
diff --git a/tools/validate_configs/golden_copies/configs/versioned_blueprint.yaml b/tools/validate_configs/golden_copies/configs/versioned_blueprint.yaml
index 8ca6f6994d..707fea2107 100644
--- a/tools/validate_configs/golden_copies/configs/versioned_blueprint.yaml
+++ b/tools/validate_configs/golden_copies/configs/versioned_blueprint.yaml
@@ -108,7 +108,6 @@ deployment_groups:
       node_count_dynamic_max: 4
       machine_type: n2-standard-2
       instance_image: $(vars.slurm_image)
-      enable_placement: false # the default is: true
       allow_automatic_updates: false
 
   - id: n2_partition
@@ -128,7 +127,6 @@ deployment_groups:
       node_count_dynamic_max: 20
      machine_type: c2-standard-60 # this is the default
       instance_image: $(vars.slurm_image)
-      enable_placement: true
       bandwidth_tier: tier_1_enabled
       disk_type: pd-ssd
       disk_size_gb: 100
@@ -142,7 +140,7 @@
     settings:
       partition_name: c2
       # the following two are true by default
-      exclusive: true # this must be true if nodeset.enable_placement is true
+      exclusive: true
 
   - id: c2d_nodeset
     source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
diff --git a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml
index 8f1f0045bf..5e61a4a478 100644
--- a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml
+++ b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/.ghpc/artifacts/expanded_blueprint.yaml
@@ -150,7 +150,6 @@ deployment_groups:
       - compute_sa
       settings:
         allow_automatic_updates: false
-        enable_placement: false
         instance_image: ((var.slurm_image))
         instance_image_custom: ((var.instance_image_custom))
         labels: ((var.labels))
@@ -185,7 +184,6 @@ deployment_groups:
         bandwidth_tier: tier_1_enabled
         disk_size_gb: 100
         disk_type: pd-ssd
-        enable_placement: true
         instance_image: ((var.slurm_image))
         instance_image_custom: ((var.instance_image_custom))
         labels: ((var.labels))
diff --git a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/main.tf b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/main.tf
index 7bc80c2cce..da44f438b6 100644
--- a/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/main.tf
+++ b/tools/validate_configs/golden_copies/expectations/versioned_blueprint/primary/main.tf
@@ -81,7 +81,6 @@ module "scratchfs" {
 module "n2_nodeset" {
   source                  = "github.com/GoogleCloudPlatform/cluster-toolkit//community/modules/compute/schedmd-slurm-gcp-v6-nodeset?ref=v1.38.0&depth=1"
   allow_automatic_updates = false
-  enable_placement        = false
   instance_image          = var.slurm_image
   instance_image_custom   = var.instance_image_custom
   labels                  = var.labels
@@ -112,7 +111,6 @@ module "c2_nodeset" {
   bandwidth_tier        = "tier_1_enabled"
   disk_size_gb          = 100
   disk_type             = "pd-ssd"
-  enable_placement      = true
   instance_image        = var.slurm_image
   instance_image_custom = var.instance_image_custom
   labels                = var.labels
diff --git a/tools/validate_configs/test_configs/config-ssh.yaml b/tools/validate_configs/test_configs/config-ssh.yaml
index 5a8c81c86f..bb1ecb7a01 100644
--- a/tools/validate_configs/test_configs/config-ssh.yaml
+++ b/tools/validate_configs/test_configs/config-ssh.yaml
@@ -48,7 +48,6 @@ deployment_groups:
     settings:
       node_count_dynamic_max: 4
       machine_type: n2-standard-2
-      enable_placement: false # the default is: true
 
   - id: debug_partition
     source: community/modules/compute/schedmd-slurm-gcp-v6-partition
diff --git a/tools/validate_configs/test_configs/gpu.yaml b/tools/validate_configs/test_configs/gpu.yaml
index f12bd323f1..988e1e13b7 100644
--- a/tools/validate_configs/test_configs/gpu.yaml
+++ b/tools/validate_configs/test_configs/gpu.yaml
@@ -139,7 +139,6 @@ deployment_groups:
     use: [network_slurm]
     settings:
       name: nogpu
-      enable_placement: false
       node_count_dynamic_max: 4
       machine_type: n2-standard-2
 
@@ -148,7 +147,6 @@ deployment_groups:
     use: [network_slurm]
     settings:
       name: man
-      enable_placement: false
       node_count_dynamic_max: 4
       machine_type: a2-ultragpu-2g
       guest_accelerator:
@@ -160,7 +158,6 @@ deployment_groups:
     use: [network_slurm]
     settings:
       name: auto
-      enable_placement: false
       node_count_dynamic_max: 4
       machine_type: a2-ultragpu-2g
 
diff --git a/tools/validate_configs/test_configs/slurm-gcp-v6-startup-scripts.yaml b/tools/validate_configs/test_configs/slurm-gcp-v6-startup-scripts.yaml
index 77be3c92d6..42e7a840a1 100644
--- a/tools/validate_configs/test_configs/slurm-gcp-v6-startup-scripts.yaml
+++ b/tools/validate_configs/test_configs/slurm-gcp-v6-startup-scripts.yaml
@@ -61,7 +61,6 @@ deployment_groups:
     settings:
       node_count_dynamic_max: 4
       machine_type: n2-standard-2
-      enable_placement: false
 
   - id: debug_partition
     source: community/modules/compute/schedmd-slurm-gcp-v6-partition
diff --git a/tools/validate_configs/test_configs/zone-policies-slurm.yaml b/tools/validate_configs/test_configs/zone-policies-slurm.yaml
index bb1985141c..69a27fb59d 100644
--- a/tools/validate_configs/test_configs/zone-policies-slurm.yaml
+++ b/tools/validate_configs/test_configs/zone-policies-slurm.yaml
@@ -48,7 +48,6 @@ deployment_groups:
       node_count_dynamic_max: 4
       machine_type: n2-standard-2
       enable_public_ips: true
-      enable_placement: false
 
   - id: zonal_partition
     source: community/modules/compute/schedmd-slurm-gcp-v6-partition
@@ -63,7 +62,6 @@ deployment_groups:
     settings:
       node_count_dynamic_max: 4
       machine_type: n2-standard-2
-      enable_placement: false
       zones: $(vars.additional_zones)
 
   - id: multizonal_partition
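Reviewer note: after this change, example blueprints rely on the nodeset's `enable_placement` default of `true` (with the placement policy's spread controlled via `placement_max_distance`), and the option only needs to be set when a user wants to opt out. The fragment below is a minimal sketch of that usage, not part of the patch; the module IDs, the `network` reference, and the machine types are illustrative.

  # Placement policy is applied by default for this nodeset; nothing to set.
  - id: compute_nodeset
    source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
    use: [network]
    settings:
      node_count_dynamic_max: 20
      machine_type: c2-standard-60

  # Explicit opt-out, e.g. to intentionally spread VMs for reliability or to
  # avoid the extra cost of a placement policy with a non-dense reservation.
  - id: spread_nodeset
    source: community/modules/compute/schedmd-slurm-gcp-v6-nodeset
    use: [network]
    settings:
      node_count_dynamic_max: 20
      machine_type: n2-standard-2
      enable_placement: false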