Add log for UT random issues to debug only #1250

Open · wants to merge 2 commits into base: main
204 changes: 201 additions & 3 deletions test/xpu/test_ops_xpu.py
@@ -1,15 +1,213 @@
# Owner(s): ["module: intel"]

import warnings

import torch
import torch._prims as prims
import torch.utils._pytree as pytree
from torch.testing._internal.common_device_type import (
    instantiate_device_type_tests,
    onlyXPU,
    OpDTypes,
    ops,
)
from torch.testing._internal.common_utils import run_tests, slowTest, suppress_warnings

try:
    from xpu_test_utils import XPUPatchForImport
except Exception:
    from .xpu_test_utils import XPUPatchForImport

with XPUPatchForImport(False):
    from test_ops import (
        _ops_and_refs_with_no_numpy_ref,
        TestCommon,
        TestMathBits,
    )

# Tests that the CPU and GPU results are consistent.
# We add logs for the test results to help debugging; they will be removed once the test is stable.
@onlyXPU
@suppress_warnings
@slowTest
@ops(_ops_and_refs_with_no_numpy_ref, dtypes=OpDTypes.any_common_cpu_cuda_one)
def _compare_cpu(self, device, dtype, op):
    def to_cpu(arg):
        if isinstance(arg, torch.Tensor):
            return arg.to(device="cpu")
        return arg

    samples = op.reference_inputs(device, dtype)
    for sample in samples:
        cpu_sample = sample.transform(to_cpu)
        cuda_results = op(sample.input, *sample.args, **sample.kwargs)
        cpu_results = op(cpu_sample.input, *cpu_sample.args, **cpu_sample.kwargs)

        # output_process_fn_grad has a very unfortunate name.
        # We use this function extensively in linalg to postprocess the outputs of
        # functions that are not completely well-defined. Think SVD and multiplying
        # the singular vectors by -1. The CPU and CUDA implementations of the SVD
        # can return valid SVDs that are different. We use this function to compare them.
        cuda_results = sample.output_process_fn_grad(cuda_results)
        cpu_results = cpu_sample.output_process_fn_grad(cpu_results)

        # Lower tolerance because we are running this as a `@slowTest`;
        # we don't want the periodic tests to fail frequently.
        try:
            self.assertEqual(cuda_results, cpu_results, atol=1e-3, rtol=1e-3)
        except AssertionError as e:
            raise AssertionError(
                f"Failed with {sample.input}, {e}"
                f"\nthe results are {cuda_results}"
                f"\nthe expected results are {cpu_results}."
            )
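Note: the patch keeps the upstream comparison logic and only wraps the final assertion, so the failure message carries the failing sample together with both results. A minimal standalone sketch of that re-raise pattern (the tensors, tolerances, and message layout below are illustrative assumptions, not code from this PR):

import torch

actual = torch.tensor([1.0, 2.0])    # stands in for the device results
expected = torch.tensor([1.0, 2.5])  # stands in for the CPU results
try:
    torch.testing.assert_close(actual, expected, atol=1e-3, rtol=1e-3)
except AssertionError as e:
    # Attach the input and both results so a flaky CI failure can be
    # diagnosed from the log alone.
    print(f"Failed with {actual}, {e}\nthe results are {actual}\nthe expected results are {expected}")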

# We add logs for the test results to help debugging; they will be removed once the test is stable.
def _ref_test_helper(
    self,
    ctx,
    device,
    dtype,
    op,
    skip_zero_numel=False,
    skip_zero_dim=False,
    skip_bfloat=False,
    skip_view_consistency=False,
):
    # NOTE: this test works by comparing the reference against the torch op.
    ex = None
    for sample in op.reference_inputs(device, dtype, requires_grad=False):
        if (
            isinstance(sample.input, torch.Tensor)
            and sample.input.numel() == 0
            and skip_zero_numel
        ):
            continue
        if (
            isinstance(sample.input, torch.Tensor)
            and sample.input.ndim == 0
            and skip_zero_dim
        ):
            continue

        if skip_bfloat and (
            (
                isinstance(sample.input, torch.Tensor)
                and sample.input.dtype == torch.bfloat16
            )
            or any(
                isinstance(arg, torch.Tensor) and arg.dtype == torch.bfloat16
                for arg in sample.args
            )
        ):
            continue
        with ctx():
            ref_result = op(sample.input, *sample.args, **sample.kwargs)
        torch_result = op.torch_opinfo(sample.input, *sample.args, **sample.kwargs)

        for a, b in zip(
            pytree.tree_leaves(ref_result), pytree.tree_leaves(torch_result)
        ):
            if isinstance(a, torch.Tensor) or isinstance(b, torch.Tensor):
                prims.utils.compare_tensor_meta(a, b)
                if (
                    getattr(op, "validate_view_consistency", True)
                    and not skip_view_consistency
                ):
                    msg = (
                        f"The torch implementation {'returns' if b._is_view() else 'does not return'} "
                        f"a view, while the reference {'does' if a._is_view() else 'does not'}"
                    )
                    try:
                        self.assertEqual(a._is_view(), b._is_view(), msg)
                    except AssertionError as e:
                        raise AssertionError(
                            f"Failed with {sample.input}, {e}"
                            f"\nthe results are {b}"
                            f"\nthe expected results are {a}."
                        )

        # Computes the dtype in which the more precise computation would occur
        precise_dtype = torch.bool
        if prims.utils.is_integer_dtype(dtype):
            # Note: bool and integer dtypes do not have more
            # precise dtypes -- they simply must be close
            precise_dtype = dtype
        if prims.utils.is_float_dtype(dtype):
            precise_dtype = torch.double
        if prims.utils.is_complex_dtype(dtype):
            precise_dtype = torch.cdouble

        # Checks if the results are close
        try:
            self.assertEqual(
                ref_result,
                torch_result,
                exact_stride=False,
                exact_device=True,
                exact_layout=True,
                exact_is_coalesced=True,
            )
        except AssertionError as e:
            # Raises the error if the precise dtype comparison wouldn't be
            # any different
            if dtype is precise_dtype:
                raise AssertionError(
                    f"Failed with {sample.input}, {e}"
                    f"\nthe results are {torch_result}"
                    f"\nthe expected results are {ref_result}."
                )

            ex = e

        # Goes to the next sample if these results are close
        if not ex:
            continue

        # If the results are not close, checks that the
        # reference is more accurate than the torch op
        def _make_precise(x):
            if isinstance(x, torch.dtype):
                return precise_dtype
            if isinstance(x, torch.Tensor) and x.dtype is dtype:
                return x.to(precise_dtype)
            return x

        precise_sample = sample.transform(_make_precise)
        precise_result = op.torch_opinfo(
            precise_sample.input, *precise_sample.args, **precise_sample.kwargs
        )

        def _distance(a, b):
            # Special-cases boolean comparisons
            if prims.utils.is_boolean_dtype(a.dtype):
                assert b.dtype is torch.bool
                return (a ^ b).sum()

            same = a == b
            if prims.utils.is_float_dtype(a.dtype) or prims.utils.is_complex_dtype(
                a.dtype
            ):
                same = torch.logical_or(
                    same, torch.logical_and(torch.isnan(a), torch.isnan(b))
                )

            actual_error = torch.where(same, 0, torch.abs(a - b)).sum()
            return actual_error

        ref_distance = 0
        for a, b in zip(
            pytree.tree_leaves(ref_result), pytree.tree_leaves(precise_result)
        ):
            ref_distance = ref_distance + _distance(a, b)

        torch_distance = 0
        for a, b in zip(
            pytree.tree_leaves(torch_result), pytree.tree_leaves(precise_result)
        ):
            torch_distance = torch_distance + _distance(a, b)

        # TODO: consider adding some tolerance to this comparison
        msg = (
            f"Reference result was farther ({ref_distance}) from the precise "
            f"computation than the torch result was ({torch_distance})!"
        )
        try:
            self.assertTrue(ref_distance <= torch_distance, msg=msg)
        except AssertionError as e:
            raise AssertionError(
                f"Failed with {sample.input}, {e}"
                f"\nthe results are {torch_result}"
                f"\nthe expected results are {precise_result}."
            )

    # Reports numerical accuracy discrepancies
    if ex is not None:
        msg = "Test passed because the reference was more accurate than the torch operator."
        warnings.warn(msg)
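Note: `_distance` is the accuracy metric used to decide whether the reference or the torch op landed closer to the double-precision result: a sum of elementwise absolute errors in which matching NaNs count as equal. A self-contained sketch of the metric on toy values (the numbers are hypothetical):

import torch

a = torch.tensor([1.0, float("nan"), 3.0])
b = torch.tensor([1.0, float("nan"), 3.5])
same = a == b  # IEEE comparison: NaN == NaN is False
same = torch.logical_or(same, torch.logical_and(torch.isnan(a), torch.isnan(b)))
error = torch.where(same, torch.zeros_like(a), torch.abs(a - b)).sum()
print(error)  # tensor(0.5000): only the last element contributes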

# Swap in the logging variants; this must happen before
# instantiate_device_type_tests() generates the per-device test classes below.
TestCommon.test_compare_cpu = _compare_cpu
TestCommon._ref_test_helper = _ref_test_helper
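Note: this follows the usual pattern in these XPU wrappers: override methods on the imported upstream class before the device-type machinery copies them into the generated per-device class (e.g. `TestCommonXPU`). A minimal sketch of the pattern with made-up names, using nothing beyond `unittest`:

import unittest

class Upstream(unittest.TestCase):  # stands in for the imported TestCommon
    def test_something(self):
        self.assertEqual(1, 1)

def _patched_test_something(self):
    try:
        self.assertEqual(1, 1)
    except AssertionError as e:
        raise AssertionError(f"extra debug context: {e}")

# The patch must land before any instantiation machinery copies the method.
Upstream.test_something = _patched_test_something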

instantiate_device_type_tests(TestCommon, globals(), only_for="xpu", allow_xpu=True)
instantiate_device_type_tests(TestMathBits, globals(), only_for="xpu", allow_xpu=True)
# in finegrand
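Note: assuming the truncated remainder of the file keeps the usual `if __name__ == "__main__": run_tests()` guard (it is not shown in this diff), the patched test should be runnable on its own with something like:

python test/xpu/test_ops_xpu.py -k test_compare_cpu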