Skip to content

Commit

Permalink
Merge pull request #157 from giffels/add-tardis-env-to-slurm
Browse files Browse the repository at this point in the history
Export tardis environment variable via slurm site adapter
  • Loading branch information
giffels authored Dec 9, 2020
2 parents 25de866 + 69083b2 commit 6b04a31
Show file tree
Hide file tree
Showing 19 changed files with 341 additions and 45 deletions.
8 changes: 8 additions & 0 deletions docs/source/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,14 @@
CHANGELOG
#########

[Unreleased] - 2020-12-08
=========================

Added
-----

* Export tardis environment variable via slurm site adapter

[0.4.0] - 2020-06-03
====================

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
category: added
summary: "Export tardis environment variable via slurm site adapter"
description: |
Export the drone meta data like allocated cores, memory as well as the drone uuid to the actual batch jobs
representing the drone. This environment variables are then used to configure the OBS node accordingly.
In addition, the drone uuid can be used as STARTD_NAME in the HTCondor OBS in order to uniquely match drones to the
underlying resources in the OBS.
pull requests:
- 157
19 changes: 16 additions & 3 deletions tardis/adapters/batchsystems/fakebatchsystem.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from tardis.configuration.configuration import Configuration
from tardis.interfaces.batchsystemadapter import BatchSystemAdapter
from tardis.interfaces.batchsystemadapter import MachineStatus
from ...configuration.configuration import Configuration
from ...interfaces.batchsystemadapter import BatchSystemAdapter
from ...interfaces.batchsystemadapter import MachineStatus
from ...utilities.attributedict import AttributeDict


class FakeBatchSystemAdapter(BatchSystemAdapter):
Expand Down Expand Up @@ -97,3 +98,15 @@ async def get_utilisation(self, drone_uuid: str) -> float:
return self.fake_config.utilisation
else:
return utilisation

@property
def machine_meta_data_translation_mapping(self) -> AttributeDict:
"""
The machine meta data translation mapping is used to translate units of
the machine meta data in ``TARDIS`` to values expected by the
FakeBatchSystem adapter.
:return: Machine meta data translation mapping
:rtype: AttributeDict
"""
return AttributeDict(Cores=1, Memory=1, Disk=1)
12 changes: 12 additions & 0 deletions tardis/adapters/batchsystems/htcondor.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,3 +237,15 @@ async def get_utilisation(self, drone_uuid: str) -> float:
:rtype: float
"""
return min(await self.get_resource_ratios(drone_uuid), default=0.0)

@property
def machine_meta_data_translation_mapping(self) -> AttributeDict:
"""
The machine meta data translation mapping is used to translate units of
the machine meta data in ``TARDIS`` to values expected by the
HTCondor batch system adapter.
:return: Machine meta data translation mapping
:rtype: AttributeDict
"""
return AttributeDict(Cores=1, Memory=1024, Disk=1024 * 1024)
12 changes: 12 additions & 0 deletions tardis/adapters/batchsystems/slurm.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,3 +247,15 @@ async def get_utilisation(self, drone_uuid: str) -> float:
:rtype: float
"""
return min(await self.get_resource_ratios(drone_uuid), default=0.0)

@property
def machine_meta_data_translation_mapping(self) -> AttributeDict:
"""
The machine meta data translation mapping is used to translate units of
the machine meta data in ``TARDIS`` to values expected by the
Slurm batch system adapter.
:return: Machine meta data translation mapping
:rtype: AttributeDict
"""
return AttributeDict(Cores=1, Memory=1000, Disk=1000)
21 changes: 6 additions & 15 deletions tardis/adapters/sites/htcondor.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,6 @@ async def htcondor_queue_updater(executor):
"7": ResourceStatus.Stopped,
}

htcondor_translate_resources_prefix = {"Cores": 1, "Memory": 1024, "Disk": 1024}


class HTCondorAdapter(SiteAdapter):
def __init__(self, machine_type: str, site_name: str):
Expand Down Expand Up @@ -101,22 +99,15 @@ async def deploy_resource(
with open(jdl_file, "r") as f:
jdl_template = Template(f.read())

try:
translated_meta_data = {
key: htcondor_translate_resources_prefix[key] * value
for key, value in self.machine_meta_data.items()
}
except KeyError as ke:
logger.critical(f"deploy_resource failed: no translation known for {ke}")
raise
else:
translated_meta_data["Uuid"] = resource_attributes.drone_uuid
drone_environment = self.drone_environment(
resource_attributes.drone_uuid,
resource_attributes.machine_meta_data_translation_mapping,
)

submit_jdl = jdl_template.substitute(
translated_meta_data,
drone_environment,
Environment=";".join(
f"TardisDrone{key}={value}"
for key, value in translated_meta_data.items()
f"TardisDrone{key}={value}" for key, value in drone_environment.items()
),
)

Expand Down
30 changes: 19 additions & 11 deletions tardis/adapters/sites/slurm.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,6 @@ def __init__(self, machine_type: str, site_name: str):
)
self._startup_command = self._configuration.StartupCommand

self._sbatch_cmdline_option_string = slurm_cmd_option_formatter(
self.sbatch_cmdline_options
)

self._executor = getattr(self._configuration, "executor", ShellExecutor())

self._slurm_status = AsyncCacheMap(
Expand Down Expand Up @@ -113,10 +109,15 @@ async def deploy_resource(
self, resource_attributes: AttributeDict
) -> AttributeDict:

sbatch_cmdline_option_string = slurm_cmd_option_formatter(
self.sbatch_cmdline_options(
resource_attributes.drone_uuid,
resource_attributes.machine_meta_data_translation_mapping,
)
)

request_command = (
"sbatch "
f"{self._sbatch_cmdline_option_string} "
f"{self._startup_command}"
f"sbatch {sbatch_cmdline_option_string} {self._startup_command}"
)

result = await self._executor.run_command(request_command)
Expand All @@ -127,7 +128,6 @@ async def deploy_resource(
remote_resource_uuid=remote_resource_uuid,
created=datetime.now(),
updated=datetime.now(),
drone_uuid=self.drone_uuid(str(remote_resource_uuid)),
resource_status=ResourceStatus.Booting,
)
return resource_attributes
Expand Down Expand Up @@ -168,12 +168,20 @@ async def terminate_resource(self, resource_attributes: AttributeDict):
{"JobId": resource_attributes.remote_resource_uuid}, **resource_attributes
)

@property
def sbatch_cmdline_options(self):
def sbatch_cmdline_options(self, drone_uuid, machine_meta_data_translation_mapping):
sbatch_options = self.machine_type_configuration.get(
"SubmitOptions", AttributeDict()
)

walltime = self.machine_type_configuration.Walltime

drone_environment = ",".join(
f"TardisDrone{key}={value}"
for key, value in self.drone_environment(
drone_uuid, machine_meta_data_translation_mapping
).items()
)

return AttributeDict(
short=AttributeDict(
**sbatch_options.get("short", AttributeDict()),
Expand All @@ -185,7 +193,7 @@ def sbatch_cmdline_options(self):
long=AttributeDict(
**sbatch_options.get("long", AttributeDict()),
mem=f"{self.machine_meta_data.Memory}gb",
export=f"SLURM_Walltime={self.machine_type_configuration.Walltime}",
export=f"SLURM_Walltime={walltime},{drone_environment}",
),
)

Expand Down
5 changes: 5 additions & 0 deletions tardis/agents/batchsystemagent.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from ..interfaces.batchsystemadapter import BatchSystemAdapter
from ..interfaces.batchsystemadapter import MachineStatus
from ..utilities.attributedict import AttributeDict


class BatchSystemAgent(BatchSystemAdapter):
Expand All @@ -23,3 +24,7 @@ async def get_machine_status(self, drone_uuid: str) -> MachineStatus:

async def get_utilisation(self, drone_uuid: str) -> float:
return await self._batch_system_adapter.get_utilisation(drone_uuid)

@property
def machine_meta_data_translation_mapping(self) -> AttributeDict:
return self._batch_system_adapter.machine_meta_data_translation_mapping
90 changes: 84 additions & 6 deletions tardis/interfaces/batchsystemadapter.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from ..utilities.attributedict import AttributeDict

from abc import ABCMeta
from abc import abstractmethod
from enum import Enum
Expand All @@ -11,26 +13,102 @@ class MachineStatus(Enum):


class BatchSystemAdapter(metaclass=ABCMeta):
"""
Abstract base class defining the interface for BatchSystemAdapters which handles
integration and management of resources in the overlay batch system.
"""

@abstractmethod
async def disintegrate_machine(self, drone_uuid: str) -> None:
return NotImplemented
"""
Disintegrate a machine from the overlay batch system.
:param drone_uuid: Uuid of the worker node, for some sites corresponding
to the host name of the drone.
:type drone_uuid: str
:return: None
"""
raise NotImplementedError

@abstractmethod
async def drain_machine(self, drone_uuid: str) -> None:
return NotImplemented
"""
Drain a machine in the overlay batch system, which means that no new
jobs will be accepted
:param drone_uuid: Uuid of the worker node, for some sites corresponding
to the host name of the drone.
:type drone_uuid: str
:return: None
"""
raise NotImplementedError

@abstractmethod
async def integrate_machine(self, drone_uuid: str) -> None:
return NotImplemented
"""
Integrate a machine into the overlay batch system.
:param drone_uuid: Uuid of the worker node, for some sites corresponding
to the host name of the drone.
:type drone_uuid: str
:return: None
"""
raise NotImplementedError

@abstractmethod
async def get_allocation(self, drone_uuid: str) -> float:
return NotImplemented
"""
Get the allocation of a worker node in the overlay batch system, which is
defined as maximum of the ratios of requested over total resources
(CPU, Memory, Disk, etc.).
:param drone_uuid: Uuid of the worker node, for some sites corresponding
to the host name of the drone.
:type drone_uuid: str
:return: The allocation of a worker node as described above.
:rtype: float
"""
raise NotImplementedError

@abstractmethod
async def get_machine_status(self, drone_uuid: str) -> MachineStatus:
return NotImplemented
"""
Get the status of a worker node in the overlay batch system (Available,
Draining, Drained, NotAvailable)
:param drone_uuid: Uuid of the worker node, for some sites corresponding
to the host name of the drone.
:type drone_uuid: str
:return: The machine status in HTCondor (Available, Draining, Drained,
NotAvailable)
:rtype: MachineStatus
"""
raise NotImplementedError

@abstractmethod
async def get_utilisation(self, drone_uuid: str) -> float:
return NotImplemented
"""
Get the utilisation of a worker node in the overlay batch system, which
is defined as minimum of the ratios of requested over total resources
(CPU, Memory, Disk, etc.).
:param drone_uuid: Uuid of the worker node, for some sites corresponding
to the host name of the drone.
:type drone_uuid: str
:return: The utilisation of a worker node as described above.
:rtype: float
"""
raise NotImplementedError

@property
@abstractmethod
def machine_meta_data_translation_mapping(self) -> AttributeDict:
"""
The machine meta data translation mapping is used to translate units of
the machine meta data in ``TARDIS`` as expected by the overlay batch
system.
:return: machine meta data translation mapping
:rtype: AttributeDict
"""
raise NotImplementedError
33 changes: 33 additions & 0 deletions tardis/interfaces/siteadapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@
from abc import ABCMeta, abstractmethod
from enum import Enum

import logging

logger = logging.getLogger("cobald.runtime.tardis.interfaces.site")


class ResourceStatus(Enum):
"""
Expand Down Expand Up @@ -113,6 +117,35 @@ def handle_response(

return translated_response

def drone_environment(
self, drone_uuid: str, meta_data_translation_mapping: AttributeDict
) -> dict:
"""
Method to get the drone environment to be exported to batch jobs
providing the actual resources in the overlay batch system. It
translates units of drone meta data into a format the overlay
batch system is expecting. Also, the drone_uuid is added for matching
drones to actual resources provided in the overlay batch system.
:param drone_uuid: The unique id which is assigned to every drone on creation
:type drone_uuid: str
:param meta_data_translation_mapping: Mapping used for the meta data translation
:type meta_data_translation_mapping: dict
:return: Translated
:rtype: dict
"""
try:
drone_environment = {
key: meta_data_translation_mapping[key] * value
for key, value in self.machine_meta_data.items()
}
except KeyError as ke:
logger.critical(f"drone_environment failed: no translation known for {ke}")
raise
else:
drone_environment["Uuid"] = drone_uuid

return drone_environment

@property
def drone_minimum_lifetime(self) -> [int, None]:
try:
Expand Down
1 change: 1 addition & 0 deletions tardis/resources/drone.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def __init__(
self.resource_attributes = AttributeDict(
site_name=self._site_agent.site_name,
machine_type=self.site_agent.machine_type,
machine_meta_data_translation_mapping=self.batch_system_agent.machine_meta_data_translation_mapping, # noqa B950
remote_resource_uuid=remote_resource_uuid,
created=created or datetime.now(),
updated=updated or datetime.now(),
Expand Down
6 changes: 6 additions & 0 deletions tests/adapters_t/batchsystems_t/test_fakebatchsystem.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,3 +64,9 @@ def test_get_utilisation(self):
self.config.BatchSystem.utilisation = AttributeDict(get_value=lambda: 0.9)
self.fake_adapter = FakeBatchSystemAdapter()
self.assertEqual(run_async(self.fake_adapter.get_utilisation, "test-123"), 0.9)

def test_machine_meta_data_translation_map(self):
self.assertEqual(
AttributeDict(Cores=1, Memory=1, Disk=1),
self.fake_adapter.machine_meta_data_translation_mapping,
)
Loading

0 comments on commit 6b04a31

Please sign in to comment.