Skip to content

Commit

Permalink
Add possibility to disable slurmdbd systemd service
Browse files Browse the repository at this point in the history
Signed-off-by: Hanwen <[email protected]>
  • Loading branch information
hanwen-cluster authored and hanwen-pcluster committed Apr 29, 2024
1 parent 4f822c8 commit 44b2a39
Show file tree
Hide file tree
Showing 4 changed files with 128 additions and 108 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,6 @@
# PMIX Version and Checksum
default['cluster']['pmix']['version'] = '4.2.9'
default['cluster']['pmix']['sha256'] = '00ddb36fb81c31519972079a218c3cdd903510fc3910abaf4d484068fa29e884'

# Slurmdbd
# Feature flag controlling whether the slurmdbd systemd service is enabled
# and started by the accounting recipe. It is kept as the STRING "true"
# (not a boolean) because the recipes compare it with == "true".
# BUG FIX: the original line used `==` (a no-op comparison) instead of `=`,
# so the attribute was never actually assigned.
default['cluster']['slurmdbd_service_enabled'] = "true"
Original file line number Diff line number Diff line change
Expand Up @@ -66,49 +66,56 @@
command "#{node['cluster']['scripts_dir']}/slurm/update_slurm_database_password.sh"
end unless kitchen_test?

# Enable and start slurmdbd only when the feature flag is set; otherwise
# make sure the service is disabled. The flag is the string "true"/"false".
action = if node['cluster']['slurmdbd_service_enabled'] == "true"
           %i(enable start)
         else
           %i(disable)
         end
service "slurmdbd" do
  supports restart: false
  # FIX: diff residue left a stale `action %i(enable start)` line here in
  # addition to the computed one; only the computed action must remain.
  action action
end unless on_docker?

# Wait for and bootstrap the Slurm accounting database, but only when the
# slurmdbd service is managed (and therefore started) by this recipe.
# FIX: diff residue duplicated the old unguarded version of this section and
# several comment/command pairs inside the heredoc; this is the clean,
# guarded post-change version.
if node['cluster']['slurmdbd_service_enabled'] == "true"
  # After starting slurmdbd the database may not be fully responsive yet and
  # its bootstrapping may fail. We need to wait for sacctmgr to successfully
  # query the database before proceeding.
  # In case of an external slurmdbd the Slurm commands do not work, so this
  # check cannot be executed.
  execute "wait for slurm database" do
    command "#{node['cluster']['slurm']['install_dir']}/bin/sacctmgr show clusters -Pn"
    retries node['cluster']['slurmdbd_response_retries']
    retry_delay 10
  end unless kitchen_test? || (node['cluster']['node_type'] == "ExternalSlurmDbd")

  # Idempotently register the cluster, a default account, and the admin
  # users in the accounting database. Each sacctmgr `add` is guarded by a
  # `show`/`list` query so reruns are no-ops.
  bash "bootstrap slurm database" do
    user 'root'
    group 'root'
    code <<-BOOTSTRAP
      SACCTMGR_CMD=#{node['cluster']['slurm']['install_dir']}/bin/sacctmgr
      CLUSTER_NAME=#{node['cluster']['stack_name']}
      DEF_ACCOUNT=pcdefault
      SLURM_USER=#{node['cluster']['slurm']['user']}
      DEF_USER=#{node['cluster']['cluster_user']}

      # Add cluster to database if it is not present yet
      [[ $($SACCTMGR_CMD show clusters -Pn cluster=$CLUSTER_NAME | grep $CLUSTER_NAME) ]] || \
        $SACCTMGR_CMD -iQ add cluster $CLUSTER_NAME

      # Add account-cluster association to database if it is not present yet
      [[ $($SACCTMGR_CMD list associations -Pn cluster=$CLUSTER_NAME account=$DEF_ACCOUNT format=account | grep $DEF_ACCOUNT) ]] || \
        $SACCTMGR_CMD -iQ add account $DEF_ACCOUNT Cluster=$CLUSTER_NAME \
          Description="ParallelCluster default account" Organization="none"

      # Add user-account associations to database if they are not present yet
      [[ $($SACCTMGR_CMD list associations -Pn cluster=$CLUSTER_NAME account=$DEF_ACCOUNT user=$SLURM_USER format=user | grep $SLURM_USER) ]] || \
        $SACCTMGR_CMD -iQ add user $SLURM_USER Account=$DEF_ACCOUNT AdminLevel=Admin
      [[ $($SACCTMGR_CMD list associations -Pn cluster=$CLUSTER_NAME account=$DEF_ACCOUNT user=$DEF_USER format=user | grep $DEF_USER) ]] || \
        $SACCTMGR_CMD -iQ add user $DEF_USER Account=$DEF_ACCOUNT AdminLevel=Admin

      # sacctmgr might throw errors if the DEF_ACCOUNT is not associated to a cluster already defined on the database.
      # This is not important for the scope of this script, so we return 0.
      exit 0
    BOOTSTRAP
  end unless kitchen_test? || (node['cluster']['node_type'] == "ExternalSlurmDbd")
end
Original file line number Diff line number Diff line change
Expand Up @@ -3,83 +3,92 @@
describe 'aws-parallelcluster-slurm::config_slurm_accounting' do
for_all_oses do |platform, version|
context "on #{platform}#{version}" do
cached(:chef_run) do
runner = runner(platform: platform, version: version) do
allow_any_instance_of(Object).to receive(:are_mount_or_unmount_required?).and_return(false)
allow_any_instance_of(Object).to receive(:dig).and_return(true)
RSpec::Mocks.configuration.allow_message_expectations_on_nil = true
end
runner.converge(described_recipe)
end
cached(:node) { chef_run.node }

it 'creates the service definition for slurmdbd' do
is_expected.to create_template('/etc/systemd/system/slurmdbd.service').with(
source: 'slurm/head_node/slurmdbd.service.erb',
owner: 'root',
group: 'root',
mode: '0644'
)
end

it 'creates the service definition for slurmdbd with the correct settings' do
is_expected.to render_file('/etc/systemd/system/slurmdbd.service')
.with_content("After=network-online.target munge.service mysql.service mysqld.service mariadb.service remote-fs.target")
end
%w(true false).each do |enable_service|
context "when service enabled is #{enable_service}" do
cached(:chef_run) do
runner = runner(platform: platform, version: version) do |node|
allow_any_instance_of(Object).to receive(:are_mount_or_unmount_required?).and_return(false)
allow_any_instance_of(Object).to receive(:dig).and_return(true)
RSpec::Mocks.configuration.allow_message_expectations_on_nil = true
node.override['cluster']['slurmdbd_service_enabled'] = enable_service
end
runner.converge(described_recipe)
end
cached(:node) { chef_run.node }

it 'creates the slurmdbd configuration files' do
slurm_install_dir = "#{node['cluster']['slurm']['install_dir']}"
slurm_user = "#{node['cluster']['slurm']['user']}"
slurm_group = "#{node['cluster']['slurm']['group']}"
is_expected.to create_template("/etc/systemd/system/slurmdbd.service").with(
source: 'slurm/head_node/slurmdbd.service.erb',
user: 'root',
group: 'root',
mode: '0644'
)
is_expected.to create_template_if_missing("#{slurm_install_dir}/etc/slurmdbd.conf").with(
source: 'slurm/slurmdbd.conf.erb',
user: slurm_user,
group: slurm_group,
mode: '0600'
)
is_expected.to create_file("#{slurm_install_dir}/etc/slurm_parallelcluster_slurmdbd.conf").with(
user: slurm_user,
group: slurm_group,
mode: '0600'
)
end
it 'creates the service definition for slurmdbd' do
is_expected.to create_template('/etc/systemd/system/slurmdbd.service').with(
source: 'slurm/head_node/slurmdbd.service.erb',
owner: 'root',
group: 'root',
mode: '0644'
)
end

it 'creates the Slurm database password update script' do
is_expected.to create_template("#{node['cluster']['scripts_dir']}/slurm/update_slurm_database_password.sh").with(
source: 'slurm/head_node/update_slurm_database_password.sh.erb',
user: 'root',
group: 'root',
mode: '0700'
)
end
it 'creates the service definition for slurmdbd with the correct settings' do
is_expected.to render_file('/etc/systemd/system/slurmdbd.service')
.with_content("After=network-online.target munge.service mysql.service mysqld.service mariadb.service remote-fs.target")
end

it 'executes the Slurm database password update scripts' do
is_expected.to run_execute("update Slurm database password").with(
command: "#{node['cluster']['scripts_dir']}/slurm/update_slurm_database_password.sh",
user: "root",
group: "root"
)
end
it 'creates the slurmdbd configuration files' do
slurm_install_dir = "#{node['cluster']['slurm']['install_dir']}"
slurm_user = "#{node['cluster']['slurm']['user']}"
slurm_group = "#{node['cluster']['slurm']['group']}"
is_expected.to create_template("/etc/systemd/system/slurmdbd.service").with(
source: 'slurm/head_node/slurmdbd.service.erb',
user: 'root',
group: 'root',
mode: '0644'
)
is_expected.to create_template_if_missing("#{slurm_install_dir}/etc/slurmdbd.conf").with(
source: 'slurm/slurmdbd.conf.erb',
user: slurm_user,
group: slurm_group,
mode: '0600'
)
is_expected.to create_file("#{slurm_install_dir}/etc/slurm_parallelcluster_slurmdbd.conf").with(
user: slurm_user,
group: slurm_group,
mode: '0600'
)
end

it 'starts the slurm database daemon' do
is_expected.to enable_service("slurmdbd")
is_expected.to start_service("slurmdbd")
end
it 'creates the Slurm database password update script' do
is_expected.to create_template("#{node['cluster']['scripts_dir']}/slurm/update_slurm_database_password.sh").with(
source: 'slurm/head_node/update_slurm_database_password.sh.erb',
user: 'root',
group: 'root',
mode: '0700'
)
end

it "waits for the Slurm database to respond" do
is_expected.to run_execute("wait for slurm database").with(
command: "#{node['cluster']['slurm']['install_dir']}/bin/sacctmgr show clusters -Pn"
)
end
it 'executes the Slurm database password update scripts' do
is_expected.to run_execute("update Slurm database password").with(
command: "#{node['cluster']['scripts_dir']}/slurm/update_slurm_database_password.sh",
user: "root",
group: "root"
)
end
if enable_service == "true"
it 'starts the slurm database daemon' do
is_expected.to enable_service("slurmdbd")
is_expected.to start_service("slurmdbd")
end
it "waits for the Slurm database to respond" do
is_expected.to run_execute("wait for slurm database").with(
command: "#{node['cluster']['slurm']['install_dir']}/bin/sacctmgr show clusters -Pn"
)
end

it "bootstraps the Slurm database idempotently" do
is_expected.to run_bash("bootstrap slurm database")
it "bootstraps the Slurm database idempotently" do
is_expected.to run_bash("bootstrap slurm database")
end
else
it 'disables the slurm database daemon' do
is_expected.to disable_service("slurmdbd")
end
end
end
end
end
end
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ EnvironmentFile=-/etc/sysconfig/slurmdbd
ExecStart=<%= node['cluster']['slurm']['install_dir'] %>/sbin/slurmdbd -D -s $SLURMDBD_OPTIONS
ExecReload=/bin/kill -HUP $MAINPID
LimitNOFILE=65536
TimeoutStartSec=5000

[Install]
WantedBy=multi-user.target

0 comments on commit 44b2a39

Please sign in to comment.