From c16de51d39bc0262f6f44c2155c720db9f245097 Mon Sep 17 00:00:00 2001
From: Logan Adams
Date: Mon, 16 Dec 2024 16:51:03 -0800
Subject: [PATCH 1/9] set gelu correctly

---
 tests/unit/ops/transformer/inference/test_gelu.py | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/tests/unit/ops/transformer/inference/test_gelu.py b/tests/unit/ops/transformer/inference/test_gelu.py
index 5f820ef3b579..782e03565cbf 100644
--- a/tests/unit/ops/transformer/inference/test_gelu.py
+++ b/tests/unit/ops/transformer/inference/test_gelu.py
@@ -9,6 +9,7 @@
 from deepspeed.ops.op_builder import InferenceBuilder
 from deepspeed.ops.transformer import DeepSpeedInferenceConfig
 from deepspeed.ops.transformer.inference.op_binding.bias_gelu import BiasGeluOp
+from deepspeed.utils.torch import required_torch_version

 if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]:
     pytest.skip("Inference ops are not available on this system", allow_module_level=True)
@@ -23,15 +24,11 @@ def allclose(x, y):


 def version_appropriate_gelu(activations):
-    global torch_minor_version
-    if torch_minor_version is None:
-        torch_minor_version = int(torch.__version__.split('.')[1])
-    # If torch version = 1.12
-    if torch_minor_version < 12:
-        return torch.nn.functional.gelu(activations)
-    else:
+    # gelu behavior changes (correctly) in torch 1.12
+    if required_torch_version(min_version=1.12):
         return torch.nn.functional.gelu(activations, approximate='tanh')
-
+    else:
+        return torch.nn.functional.gelu(activations)

 def run_gelu_reference(activations):
     # Expected behavior is that of casting to float32 internally and using the tanh approximation
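Note on patch 1: the old helper derived the torch minor version with int(torch.__version__.split('.')[1]), which misreads torch 2.x releases (e.g. 2.1 has minor 1, which compares as < 12) and would force the exact gelu even where the tanh form is available. The patch swaps in required_torch_version from deepspeed.utils.torch. As an illustration only — not the DeepSpeed implementation, and the helper name torch_at_least is hypothetical — a version gate of the same shape can be built on packaging:

    # Sketch of a packaging-based version gate similar in spirit to
    # deepspeed.utils.torch.required_torch_version. torch_at_least is a
    # hypothetical name used only for this illustration.
    import torch
    from packaging import version as pkg_version

    def torch_at_least(min_version: str) -> bool:
        # Handles multi-digit minors (1.13) and major bumps (2.x) that the old
        # split('.')[1] parsing mishandled.
        return pkg_version.parse(torch.__version__) >= pkg_version.parse(min_version)

    def reference_gelu(activations):
        # torch >= 1.12 exposes the tanh approximation used by the test's reference.
        if torch_at_least("1.12"):
            return torch.nn.functional.gelu(activations, approximate='tanh')
        return torch.nn.functional.gelu(activations)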
From db6851fc9c34b2dde3c05b38cef198964ff12778 Mon Sep 17 00:00:00 2001
From: Logan Adams
Date: Mon, 16 Dec 2024 16:59:47 -0800
Subject: [PATCH 2/9] Remove all instances of torch_minor

---
 tests/unit/ops/transformer/inference/test_bias_geglu.py | 2 --
 tests/unit/ops/transformer/inference/test_bias_gelu.py | 2 --
 tests/unit/ops/transformer/inference/test_bias_relu.py | 2 --
 tests/unit/ops/transformer/inference/test_gelu.py | 2 --
 tests/unit/ops/transformer/inference/test_matmul.py | 1 -
 tests/unit/ops/transformer/inference/test_softmax.py | 2 --
 6 files changed, 11 deletions(-)

diff --git a/tests/unit/ops/transformer/inference/test_bias_geglu.py b/tests/unit/ops/transformer/inference/test_bias_geglu.py
index 05de4fbb4cf8..c995d2a8c46d 100644
--- a/tests/unit/ops/transformer/inference/test_bias_geglu.py
+++ b/tests/unit/ops/transformer/inference/test_bias_geglu.py
@@ -15,8 +15,6 @@
 if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]:
     pytest.skip("Inference ops are not available on this system", allow_module_level=True)

-torch_minor_version = None
-

 def run_bias_geglu_reference(activations, bias):
     # Expected behavior is that of casting to float32 internally
diff --git a/tests/unit/ops/transformer/inference/test_bias_gelu.py b/tests/unit/ops/transformer/inference/test_bias_gelu.py
index b69030e87ace..e3a3bad63961 100644
--- a/tests/unit/ops/transformer/inference/test_bias_gelu.py
+++ b/tests/unit/ops/transformer/inference/test_bias_gelu.py
@@ -16,8 +16,6 @@
 if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]:
     pytest.skip("Inference ops are not available on this system", allow_module_level=True)

-torch_minor_version = None
-

 def run_bias_gelu_reference(activations, bias):
     # Expected behavior is that of casting to float32 internally and using the tanh approximation
diff --git a/tests/unit/ops/transformer/inference/test_bias_relu.py b/tests/unit/ops/transformer/inference/test_bias_relu.py
index 57134665b241..69078f9f7646 100644
--- a/tests/unit/ops/transformer/inference/test_bias_relu.py
+++ b/tests/unit/ops/transformer/inference/test_bias_relu.py
@@ -15,8 +15,6 @@
 if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]:
     pytest.skip("Inference ops are not available on this system", allow_module_level=True)

-torch_minor_version = None
-

 def run_bias_relu_reference(activations, bias):
     # Expected behavior is that of casting to float32 internally
diff --git a/tests/unit/ops/transformer/inference/test_gelu.py b/tests/unit/ops/transformer/inference/test_gelu.py
index 782e03565cbf..54f762c6b232 100644
--- a/tests/unit/ops/transformer/inference/test_gelu.py
+++ b/tests/unit/ops/transformer/inference/test_gelu.py
@@ -14,8 +14,6 @@
 if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]:
     pytest.skip("Inference ops are not available on this system", allow_module_level=True)

-torch_minor_version = None
-

 def allclose(x, y):
     assert x.dtype == y.dtype
diff --git a/tests/unit/ops/transformer/inference/test_matmul.py b/tests/unit/ops/transformer/inference/test_matmul.py
index 559aa2c60afe..2ab195ee0115 100644
--- a/tests/unit/ops/transformer/inference/test_matmul.py
+++ b/tests/unit/ops/transformer/inference/test_matmul.py
@@ -12,7 +12,6 @@
     pytest.skip("Inference ops are not available on this system", allow_module_level=True)

 inference_module = None
-torch_minor_version = None


 def allclose(x, y):
diff --git a/tests/unit/ops/transformer/inference/test_softmax.py b/tests/unit/ops/transformer/inference/test_softmax.py
index e582be1b926a..83785ac38ebb 100644
--- a/tests/unit/ops/transformer/inference/test_softmax.py
+++ b/tests/unit/ops/transformer/inference/test_softmax.py
@@ -11,8 +11,6 @@
 if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]:
     pytest.skip("Inference ops are not available on this system", allow_module_level=True)

-torch_minor_version = None
-

 def allclose(x, y):
     assert x.dtype == y.dtype

From 0bd98e462c94c4883feb9246a97659b3c6bf11eb Mon Sep 17 00:00:00 2001
From: Logan Adams
Date: Mon, 16 Dec 2024 17:06:26 -0800
Subject: [PATCH 3/9] Test with the same gelu always

---
 tests/unit/ops/transformer/inference/test_gelu.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/tests/unit/ops/transformer/inference/test_gelu.py b/tests/unit/ops/transformer/inference/test_gelu.py
index 54f762c6b232..62338ff384f1 100644
--- a/tests/unit/ops/transformer/inference/test_gelu.py
+++ b/tests/unit/ops/transformer/inference/test_gelu.py
@@ -23,10 +23,7 @@ def allclose(x, y):

 def version_appropriate_gelu(activations):
     # gelu behavior changes (correctly) in torch 1.12
-    if required_torch_version(min_version=1.12):
-        return torch.nn.functional.gelu(activations, approximate='tanh')
-    else:
-        return torch.nn.functional.gelu(activations)
+    return torch.nn.functional.gelu(activations)

 def run_gelu_reference(activations):
     # Expected behavior is that of casting to float32 internally and using the tanh approximation
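Note on patch 3: collapsing version_appropriate_gelu to the plain (exact, erf-based) gelu changes what the reference computes, because the exact form and the tanh approximation are close but not identical, and the comment in run_gelu_reference says the expected behavior follows the tanh approximation. This is why the change is undone by the revert in patch 8. A quick, illustrative way to see the gap (exact values depend on the random input):

    # Illustrative: compare exact gelu against the tanh approximation.
    import torch

    x = torch.randn(4, 256, dtype=torch.float32)
    exact = torch.nn.functional.gelu(x)
    approx = torch.nn.functional.gelu(x, approximate='tanh')
    # Small but nonzero difference; enough to trip tight float32 tolerances.
    print((exact - approx).abs().max())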
From ed5fb1f23d3ea704366482527311946609c2c2a7 Mon Sep 17 00:00:00 2001
From: Logan Adams
Date: Tue, 17 Dec 2024 09:54:54 -0800
Subject: [PATCH 4/9] Add skip with correct logic

---
 tests/unit/ops/transformer/inference/test_bias_gelu.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/unit/ops/transformer/inference/test_bias_gelu.py b/tests/unit/ops/transformer/inference/test_bias_gelu.py
index e3a3bad63961..5a82b95dd87b 100644
--- a/tests/unit/ops/transformer/inference/test_bias_gelu.py
+++ b/tests/unit/ops/transformer/inference/test_bias_gelu.py
@@ -10,8 +10,8 @@
 from deepspeed.ops.op_builder import InferenceBuilder
 from deepspeed.ops.transformer import DeepSpeedInferenceConfig
 from deepspeed.ops.transformer.inference.op_binding.bias_gelu import BiasGeluOp
+from deepspeed.utils.torch import required_torch_version
 from .inference_test_utils import allclose, get_dtypes
-from packaging import version as pkg_version

 if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]:
     pytest.skip("Inference ops are not available on this system", allow_module_level=True)
@@ -34,7 +34,7 @@ def run_bias_gelu_ds(activations, bias):
 @pytest.mark.parametrize("channels", [512, 1232, 4096])
 @pytest.mark.parametrize("dtype", get_dtypes())
 def test_bias_gelu(batch, sequence, channels, dtype):
-    if pkg_version.parse(torch.__version__) < pkg_version.parse("1.12"):
+    if required_torch_version(min_version=1.12):
         pytest.skip("gelu implementation matches only after torch 1.12")

     activations_ds = torch.randn((batch, sequence, channels), dtype=dtype, device=get_accelerator().device_name())

From 63d434c8a1fafeb9887efd145f1b1ebc6642da1a Mon Sep 17 00:00:00 2001
From: Logan Adams
Date: Tue, 17 Dec 2024 10:02:28 -0800
Subject: [PATCH 5/9] Switch to not check for skip

---
 tests/unit/ops/transformer/inference/test_bias_gelu.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit/ops/transformer/inference/test_bias_gelu.py b/tests/unit/ops/transformer/inference/test_bias_gelu.py
index 5a82b95dd87b..f0a09245e890 100644
--- a/tests/unit/ops/transformer/inference/test_bias_gelu.py
+++ b/tests/unit/ops/transformer/inference/test_bias_gelu.py
@@ -34,7 +34,7 @@ def run_bias_gelu_ds(activations, bias):
 @pytest.mark.parametrize("channels", [512, 1232, 4096])
 @pytest.mark.parametrize("dtype", get_dtypes())
 def test_bias_gelu(batch, sequence, channels, dtype):
-    if required_torch_version(min_version=1.12):
+    if not required_torch_version(min_version=1.12):
         pytest.skip("gelu implementation matches only after torch 1.12")

     activations_ds = torch.randn((batch, sequence, channels), dtype=dtype, device=get_accelerator().device_name())
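Note on patches 4-5: the polarity of the skip guard is easy to invert. required_torch_version(min_version=1.12) is true on torch 1.12 and newer, so patch 4's guard skipped exactly the versions the skip message says should run; patch 5 negates it so only older torch is skipped. A minimal sketch of the intended guard, reusing the helper and message from the diffs (the function name test_bias_gelu_guard_sketch is hypothetical and the real test body is elided):

    # Sketch of the skip guard as patch 5 leaves it.
    import pytest
    from deepspeed.utils.torch import required_torch_version

    def test_bias_gelu_guard_sketch():
        # Skip only when torch is older than 1.12, where torch's gelu cannot
        # match the tanh-approximate reference used by the test.
        if not required_torch_version(min_version=1.12):
            pytest.skip("gelu implementation matches only after torch 1.12")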
From 1ac687c57933c4f4e876f472f654ffc4e4f31ac0 Mon Sep 17 00:00:00 2001
From: Logan Adams
Date: Tue, 17 Dec 2024 10:17:50 -0800
Subject: [PATCH 6/9] Revert "Switch to not check for skip"

This reverts commit 63d434c8a1fafeb9887efd145f1b1ebc6642da1a.
---
 tests/unit/ops/transformer/inference/test_bias_gelu.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit/ops/transformer/inference/test_bias_gelu.py b/tests/unit/ops/transformer/inference/test_bias_gelu.py
index f0a09245e890..5a82b95dd87b 100644
--- a/tests/unit/ops/transformer/inference/test_bias_gelu.py
+++ b/tests/unit/ops/transformer/inference/test_bias_gelu.py
@@ -34,7 +34,7 @@ def run_bias_gelu_ds(activations, bias):
 @pytest.mark.parametrize("channels", [512, 1232, 4096])
 @pytest.mark.parametrize("dtype", get_dtypes())
 def test_bias_gelu(batch, sequence, channels, dtype):
-    if not required_torch_version(min_version=1.12):
+    if required_torch_version(min_version=1.12):
         pytest.skip("gelu implementation matches only after torch 1.12")

     activations_ds = torch.randn((batch, sequence, channels), dtype=dtype, device=get_accelerator().device_name())

From f0bd65e7f6eaa96decb76f544ba85536879e5466 Mon Sep 17 00:00:00 2001
From: Logan Adams
Date: Tue, 17 Dec 2024 10:17:54 -0800
Subject: [PATCH 7/9] Revert "Add skip with correct logic"

This reverts commit ed5fb1f23d3ea704366482527311946609c2c2a7.
---
 tests/unit/ops/transformer/inference/test_bias_gelu.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/unit/ops/transformer/inference/test_bias_gelu.py b/tests/unit/ops/transformer/inference/test_bias_gelu.py
index 5a82b95dd87b..e3a3bad63961 100644
--- a/tests/unit/ops/transformer/inference/test_bias_gelu.py
+++ b/tests/unit/ops/transformer/inference/test_bias_gelu.py
@@ -10,8 +10,8 @@
 from deepspeed.ops.op_builder import InferenceBuilder
 from deepspeed.ops.transformer import DeepSpeedInferenceConfig
 from deepspeed.ops.transformer.inference.op_binding.bias_gelu import BiasGeluOp
-from deepspeed.utils.torch import required_torch_version
 from .inference_test_utils import allclose, get_dtypes
+from packaging import version as pkg_version

 if not deepspeed.ops.__compatible_ops__[InferenceBuilder.NAME]:
     pytest.skip("Inference ops are not available on this system", allow_module_level=True)
@@ -34,7 +34,7 @@ def run_bias_gelu_ds(activations, bias):
 @pytest.mark.parametrize("channels", [512, 1232, 4096])
 @pytest.mark.parametrize("dtype", get_dtypes())
 def test_bias_gelu(batch, sequence, channels, dtype):
-    if required_torch_version(min_version=1.12):
+    if pkg_version.parse(torch.__version__) < pkg_version.parse("1.12"):
         pytest.skip("gelu implementation matches only after torch 1.12")

     activations_ds = torch.randn((batch, sequence, channels), dtype=dtype, device=get_accelerator().device_name())
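Note on patches 6-7: the two reverts drop the required_torch_version changes from test_bias_gelu.py and restore its original packaging-based guard. Both guards are meant to express the same condition, torch older than 1.12, though they are implemented differently (pre-release version strings are one place they could conceivably disagree). An illustrative side-by-side, assuming both imports are available in the environment:

    # Compare the two "is torch older than 1.12?" spellings used in this series.
    import torch
    from packaging import version as pkg_version
    from deepspeed.utils.torch import required_torch_version

    older_via_packaging = pkg_version.parse(torch.__version__) < pkg_version.parse("1.12")
    older_via_helper = not required_torch_version(min_version=1.12)
    print(older_via_packaging, older_via_helper)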
From 3d58fafbcbedb7b64d4bf76184436e2ecfe541a7 Mon Sep 17 00:00:00 2001
From: Logan Adams
Date: Tue, 17 Dec 2024 10:17:56 -0800
Subject: [PATCH 8/9] Revert "Test with the same gelu always"

This reverts commit 0bd98e462c94c4883feb9246a97659b3c6bf11eb.
---
 tests/unit/ops/transformer/inference/test_gelu.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/unit/ops/transformer/inference/test_gelu.py b/tests/unit/ops/transformer/inference/test_gelu.py
index 62338ff384f1..54f762c6b232 100644
--- a/tests/unit/ops/transformer/inference/test_gelu.py
+++ b/tests/unit/ops/transformer/inference/test_gelu.py
@@ -23,7 +23,10 @@ def allclose(x, y):

 def version_appropriate_gelu(activations):
     # gelu behavior changes (correctly) in torch 1.12
-    return torch.nn.functional.gelu(activations)
+    if required_torch_version(min_version=1.12):
+        return torch.nn.functional.gelu(activations, approximate='tanh')
+    else:
+        return torch.nn.functional.gelu(activations)

 def run_gelu_reference(activations):
     # Expected behavior is that of casting to float32 internally and using the tanh approximation

From bb1c3c4dc1d2c54f7d3a0ba22765b6cd070d66f4 Mon Sep 17 00:00:00 2001
From: Logan Adams
Date: Tue, 17 Dec 2024 10:22:08 -0800
Subject: [PATCH 9/9] Formatting

---
 tests/unit/ops/transformer/inference/test_gelu.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/unit/ops/transformer/inference/test_gelu.py b/tests/unit/ops/transformer/inference/test_gelu.py
index 54f762c6b232..a58abfdb100c 100644
--- a/tests/unit/ops/transformer/inference/test_gelu.py
+++ b/tests/unit/ops/transformer/inference/test_gelu.py
@@ -28,6 +28,7 @@ def version_appropriate_gelu(activations):
     else:
         return torch.nn.functional.gelu(activations)

+
 def run_gelu_reference(activations):
     # Expected behavior is that of casting to float32 internally and using the tanh approximation
     return version_appropriate_gelu(activations.to(torch.float32)).to(activations.dtype)
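Net effect of the series on tests/unit/ops/transformer/inference/test_gelu.py, assembled from the hunks above (a reconstruction of the touched region only, not a copy of the full file): the module-level torch_minor_version global is gone, required_torch_version is imported from deepspeed.utils.torch, and the reference helper reads:

    # Reconstructed from patches 1, 2, 8 and 9; only the changed region is shown.
    import torch
    from deepspeed.utils.torch import required_torch_version

    def version_appropriate_gelu(activations):
        # gelu behavior changes (correctly) in torch 1.12
        if required_torch_version(min_version=1.12):
            return torch.nn.functional.gelu(activations, approximate='tanh')
        else:
            return torch.nn.functional.gelu(activations)

    def run_gelu_reference(activations):
        # Expected behavior is that of casting to float32 internally and using the tanh approximation
        return version_appropriate_gelu(activations.to(torch.float32)).to(activations.dtype)

test_bias_gelu.py ends the series with only patch 2's removal of the unused torch_minor_version global, since patches 6 and 7 undo patches 5 and 4.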