From 8d4e0d706a2fcde23c5c1ef00cbb0745db9be7eb Mon Sep 17 00:00:00 2001
From: vikram singh shekhawat
Date: Tue, 20 Feb 2024 14:27:26 +0200
Subject: [PATCH 1/2] Add HPU accelerator support in unit tests.

This commit enables HPU support in DeepSpeed unit tests.

---
 tests/unit/common.py | 25 ++++++++++++++++++++-----
 tests/unit/util.py   |  3 +++
 2 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/tests/unit/common.py b/tests/unit/common.py
index 76bebf6b725a..7d8d9007dff0 100644
--- a/tests/unit/common.py
+++ b/tests/unit/common.py
@@ -23,7 +23,7 @@
 from _pytest.fixtures import FixtureLookupError, FixtureFunctionMarker
 
 # Worker timeout for tests that hang
-DEEPSPEED_TEST_TIMEOUT = 600
+DEEPSPEED_TEST_TIMEOUT = int(os.environ.get('DEEPSPEED_TEST_TIMEOUT', '600'))
 
 
 def is_rocm_pytorch():
@@ -81,6 +81,11 @@ def set_accelerator_visible():
             match = re.search('Device Type.*GPU', line)
             if match:
                 num_accelerators += 1
+    elif get_accelerator().device_name() == 'hpu':
+        hl_smi = subprocess.check_output(['hl-smi', "-L"])
+        num_accelerators = re.findall(r"Module ID\s+:\s+(\d+)", hl_smi.decode())
+        num_accelerators = sorted(num_accelerators, key=int)
+        os.environ["HABANA_VISIBLE_MODULES"] = ",".join(num_accelerators)
     elif get_accelerator().device_name() == 'npu':
         npu_smi = subprocess.check_output(['npu-smi', 'info', '-l'])
         num_accelerators = int(npu_smi.decode('utf-8').strip().split('\n')[0].split(':')[1].strip())
@@ -90,7 +95,10 @@ def set_accelerator_visible():
             subprocess.check_output('cat /proc/cpuinfo | grep "physical id" | sort -u | wc -l', shell=True))
         num_accelerators = cpu_sockets
 
-    cuda_visible = ",".join(map(str, range(num_accelerators)))
+    if isinstance(num_accelerators, list):
+        cuda_visible = ",".join(num_accelerators)
+    else:
+        cuda_visible = ",".join(map(str, range(num_accelerators)))
 
     # rotate list based on xdist worker id, example below
     # wid=0 -> ['0', '1', '2', '3']
@@ -149,6 +157,12 @@ def _get_fixture_kwargs(self, request, func):
     def _launch_daemonic_procs(self, num_procs):
         # Create process pool or use cached one
         master_port = None
+
+        if get_accelerator().device_name() == 'hpu':
+            if self.reuse_dist_env:
+                print("Ignoring reuse_dist_env for hpu")
+                self.reuse_dist_env = False
+
         if self.reuse_dist_env:
             if num_procs not in self._pool_cache:
                 self._pool_cache[num_procs] = mp.Pool(processes=num_procs)
@@ -169,9 +183,10 @@ def _launch_daemonic_procs(self, num_procs):
             # usually means an environment error and the rest of tests will
             # hang (causing super long unit test runtimes)
             pytest.exit("Test hanged, exiting", returncode=1)
-
-        # Tear down distributed environment and close process pools
-        self._close_pool(pool, num_procs)
+        finally:
+            # Regardless of the outcome, ensure proper teardown
+            # Tear down distributed environment and close process pools
+            self._close_pool(pool, num_procs)
 
         # If we skipped a test, propagate that to this process
         if any(skip_msgs):
diff --git a/tests/unit/util.py b/tests/unit/util.py
index 75c3000bd4a2..e8e0f476371b 100644
--- a/tests/unit/util.py
+++ b/tests/unit/util.py
@@ -47,11 +47,14 @@ def bf16_required_version_check(accelerator_check=True):
     cuda_version_available = CUDA_MAJOR >= 11
     nccl_version_available = NCCL_MAJOR > 2 or (NCCL_MAJOR == 2 and NCCL_MINOR >= 10)
     npu_available = get_accelerator().device_name() == 'npu'
+    hpu_available = get_accelerator().device_name() == 'hpu'
 
     if torch_version_available and cuda_version_available and nccl_version_available and accelerator_pass:
         return True
     elif npu_available:
         return True
+    elif hpu_available:
+        return True
     else:
         return False

From 7eb5f78b2789a19a0cf1635b151f426b0ae0b734 Mon Sep 17 00:00:00 2001
From: vikram singh shekhawat
Date: Fri, 1 Mar 2024 06:46:10 +0200
Subject: [PATCH 2/2] Update timeout variable for DeepSpeed tests.

Changed the environment variable that specifies the DeepSpeed test timeout
from 'DEEPSPEED_TEST_TIMEOUT' to 'DS_UNITTEST_TIMEOUT' to align with the
naming convention.

---
 tests/unit/common.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit/common.py b/tests/unit/common.py
index 7d8d9007dff0..d9fa5e041207 100644
--- a/tests/unit/common.py
+++ b/tests/unit/common.py
@@ -23,7 +23,7 @@
 from _pytest.fixtures import FixtureLookupError, FixtureFunctionMarker
 
 # Worker timeout for tests that hang
-DEEPSPEED_TEST_TIMEOUT = int(os.environ.get('DEEPSPEED_TEST_TIMEOUT', '600'))
+DEEPSPEED_TEST_TIMEOUT = int(os.environ.get('DS_UNITTEST_TIMEOUT', '600'))
 
 
 def is_rocm_pytorch():
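
Note on the HPU enumeration added in PATCH 1/2: unlike the CUDA and NPU branches, which count devices and build the visible-device string from a range of integers, the HPU branch parses hl-smi -L output into a list of module-ID strings and exports it via HABANA_VISIBLE_MODULES, which is why set_accelerator_visible now handles both list and integer forms of num_accelerators. The sketch below walks through that parsing step in isolation; the sample hl-smi -L text is an assumption for illustration, not captured from real hardware.

import os
import re

# Assumed, abbreviated hl-smi -L output: the real tool prints one
# "Module ID : <n>" line per Gaudi card among other per-device fields.
sample_hl_smi_output = """
[0] HL-225 ...
        Module ID                      : 1
[1] HL-225 ...
        Module ID                      : 0
"""

# Same regex as the patch: collect every Module ID as a string.
module_ids = re.findall(r"Module ID\s+:\s+(\d+)", sample_hl_smi_output)

# Sort numerically (so "10" follows "9"), then expose the list through
# HABANA_VISIBLE_MODULES, the HPU counterpart of CUDA_VISIBLE_DEVICES.
module_ids = sorted(module_ids, key=int)
os.environ["HABANA_VISIBLE_MODULES"] = ",".join(module_ids)

print(os.environ["HABANA_VISIBLE_MODULES"])  # prints "0,1" for this sample

The timeout change in PATCH 2/2 only renames the knob; for example, a run that previously exported DEEPSPEED_TEST_TIMEOUT=1200 would now export DS_UNITTEST_TIMEOUT=1200 before invoking the unit tests.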