[dy2static] implement tensor.cuda() in static graph #56092

Merged
merged 4 commits on Aug 11, 2023
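
For context, a minimal sketch of what this PR enables, assembled from the new docstring example and the tests added below. It is not part of the diff itself; the helper name `copy_to_cuda` and the final place check are only for illustration.

import paddle

@paddle.jit.to_static
def copy_to_cuda(x):
    x = paddle.to_tensor(x)
    # Under dy2static, x.cuda() now lowers to a `memcpy` op with
    # dst_place_type=1 (CUDAPlace), as implemented in math_op_patch.py below.
    return x.cuda()

if paddle.is_compiled_with_cuda():
    out = copy_to_cuda(paddle.ones([2, 2]))
    print(out.place)  # expected: a GPU place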
interpreter_util.cc
@@ -476,6 +476,56 @@ void ApplyDeviceGuard(const OperatorBase* op_base,
}
}

platform::DeviceContext* ConstructDeviceContext(const OperatorBase* op,
const platform::Place& place) {
auto& pool = platform::DeviceContextPool::Instance();
auto* default_dev_ctx = pool.Get(place);

// Replace the default_dev_ctx according to dst_place_type for the memcpy op
// if needed.

// NOTE(liudongxue01):
// Please apply the same logic in other Executor/Interpreter modules
// likewise.
//

// NOTE(liudongxue01):
// The following code fixes up the memcpy kernel, which does not handle a
// rare case. The case is:
// 1. The default place in the current execution context is not CUDAPlace,
//    such as CPUPlace,
// 2. The dst_place_type is 1, which means CUDAPlace,
// 3. The expected result place is CUDAPlace but the actual result is
//    CPUPlace.
// When the default place is CPUPlace, calling tensor.cuda() hits exactly
// this case.
//
// Q: Why don't we add this logic in the memcpy kernel itself?
// A: (1) To fix up the memcpy kernel, we would need to construct a CUDAPlace()
//    and the corresponding DeviceContext instance used by the phi::Copy(...)
//    API to perform the real memcpy. (2) We should not access the
//    DeviceContextPool singleton from the PHI framework, which is designed
//    as a standalone module where all context data is passed into the
//    kernel API through arguments. (3) So there is no way to construct a
//    CUDAPlace() in the memcpy kernel and pass it to the phi::Copy(...) API.
if (!platform::is_gpu_place(place)) {
const auto& op_type = op->Type();
if (op_type == "memcpy") {
int dst_place_type = op->Attr<int>("dst_place_type");
if (dst_place_type == 1) { // 1 : CUDAPlace
auto dev_ctx = pool.Get(paddle::DefaultGPUPlace());
VLOG(4) << "Change the device context for memcpy OP: ("
<< default_dev_ctx->type_info().name() << ") -> ("
<< dev_ctx->type_info().name() << ")";
return dev_ctx;
}
}
}

return default_dev_ctx;
}

void HandleOperatorBase(const platform::Place& place,
std::shared_ptr<OperatorBase> op,
OpFuncNode* op_func_node,
@@ -537,6 +587,8 @@ void BuildOpFuncList(const platform::Place& place,
}
auto unused_var_map = GetUnusedVars(block, ops);

platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();

bool flag_log_is_printed = false;
for (size_t i = 0; i < ops.size(); ++i) {
auto op = ops[i].get();
@@ -655,8 +707,9 @@ void BuildOpFuncList(const platform::Place& place,
runtime_scope = local_scope;
}

auto& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.Get(place);
// construct the device context
auto* dev_ctx = ConstructDeviceContext(op, place);

SetDeviceCommContext(op, dev_ctx);
auto exec_ctx = ExecutionContext(
*op_with_kernel, *runtime_scope, *dev_ctx, runtime_context);
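To make the "rare case" in the NOTE above concrete, here is a sketch mirroring the TestTensorCopyToCUDAOnDefaultCPU test added below: the expected place is forced to CPU, yet the memcpy op emitted by Tensor.cuda() targets CUDAPlace (dst_place_type=1), so the executor must swap in a GPU device context via ConstructDeviceContext(). The `_set_expected_place` call is the internal helper the tests use; the rest is illustrative only.

import paddle

@paddle.jit.to_static
def copy_to_cuda(x):
    x = paddle.to_tensor(x)
    return x.cuda()  # appends a memcpy op with dst_place_type=1

if paddle.fluid.is_compiled_with_cuda():
    # Default execution place is CPU, but the copy targets CUDAPlace.
    paddle.fluid.framework._set_expected_place(paddle.CPUPlace())
    out = copy_to_cuda(paddle.ones([1, 2, 3]))
    assert out.place.is_gpu_place()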
57 changes: 51 additions & 6 deletions python/paddle/fluid/layers/math_op_patch.py
@@ -146,6 +146,7 @@ def cpu(self):
In Static Graph Mode:

.. code-block:: python

import paddle
paddle.enable_static()

@@ -162,7 +163,7 @@ def cpu(self):
persistable=False,
stop_gradient=True,
)
# 0 means cpu place, see paddle/fluid/operators/memcpy_op.h
# 0 means cpu place, see paddle/phi/kernels/memcpy_kernel.cc
attrs = {'dst_place_type': 0}
block.append_op(
type='memcpy',
@@ -173,13 +174,57 @@ def cpu(self):
return output

@static_only
def cuda(self):
def cuda(self, device_id=None, blocking=True):
"""
Variable should not have cpu() and cuda() interface.
But this interface can greatly facilitate dy2static.
We do nothing here.
In dy2static, Variable also needs the cpu() and cuda() interfaces.
However, the underlying operator has only a forward op, not a backward one.

Args:
self(Variable): The variable itself.
device_id(int, optional): The destination GPU device id. Default: None, which means the current device.
This argument exists only for dy2static translation; please do not use it.
blocking(bool, optional): Whether the copy is blocking. Default: True.
This argument exists only for dy2static translation; please do not use it.

Returns:
The tensor copied to the CUDA place.

Examples:
In Static Graph Mode:

.. code-block:: python

import paddle
paddle.enable_static()

x = paddle.static.data(name="x", shape=[2,2], dtype='float32')
y = x.cpu()
z = y.cuda()
"""
return self
if device_id is not None:
warnings.warn("device_id is not supported, and it will be ignored.")
if blocking is not True:
warnings.warn("blocking is not supported, and it will be ignored.")

block = current_block(self)
tmp_name = unique_tmp_name()
output = block.create_var(
name=tmp_name,
dtype=self.dtype,
shape=self.shape,
type=self.type,
persistable=False,
stop_gradient=True,
)
# 1 means cuda place, see paddle/phi/kernels/memcpy_kernel.cc
attrs = {'dst_place_type': 1}
block.append_op(
type='memcpy',
inputs={'X': [self]},
outputs={'Out': [output]},
attrs=attrs,
)
return output

@static_only
def place(self):
76 changes: 76 additions & 0 deletions test/dygraph_to_static/test_tensor_memcpy_on_cpu.py
@@ -26,6 +26,20 @@ def tensor_copy_to_cpu(x):
return y


@paddle.jit.to_static
def tensor_copy_to_cuda(x):
x = paddle.to_tensor(x)
y = x.cuda()
return y


@paddle.jit.to_static
def tensor_copy_to_cuda_with_warning(x, device_id=None, blocking=True):
x = paddle.to_tensor(x)
y = x.cuda(device_id, blocking)
return y


class TestTensorCopyToCpuOnDefaultCPU(unittest.TestCase):
def _run(self, to_static):
paddle.jit.enable_to_static(to_static)
@@ -46,5 +60,67 @@ def test_tensor_cpu_on_default_cpu(self):
self.assertTrue(static_place.is_cpu_place())


class TestTensorCopyToCUDAOnDefaultCPU(unittest.TestCase):
def _run(self, to_static):
paddle.jit.enable_to_static(to_static)
x1 = paddle.ones([1, 2, 3])
x2 = tensor_copy_to_cuda(x1)
return x1.place, x2.place, x2.numpy()

def test_tensor_cuda_on_default_cpu(self):
if not paddle.fluid.is_compiled_with_cuda():
return

"""
Note(liudongxue01): If the following asserts fail,
please check whether the workaround logic for the memcpy OP
is still taking effect.
See ConstructDeviceContext() in interpreter_util.cc.
"""
paddle.fluid.framework._set_expected_place(paddle.CPUPlace())
dygraph_x1_place, dygraph_place, dygraph_res = self._run(
to_static=False
)
static_x1_place, static_place, static_res = self._run(to_static=True)
np.testing.assert_allclose(dygraph_res, static_res, rtol=1e-05)
self.assertTrue(dygraph_x1_place.is_cpu_place())
self.assertTrue(static_x1_place.is_cpu_place())
self.assertTrue(dygraph_place.is_gpu_place())
self.assertTrue(static_place.is_gpu_place())


class TestTensorCopyToCUDAWithWarningOnCPU(unittest.TestCase):
def _run(self, to_static):
paddle.jit.enable_to_static(to_static)
x1 = paddle.ones([1, 2, 3])
x2 = tensor_copy_to_cuda_with_warning(x1, device_id=1, blocking=False)
return x1.place, x2.place, x2.numpy()

def test_with_warning_on_cpu(self):
if not paddle.fluid.is_compiled_with_cuda():
return

paddle.fluid.framework._set_expected_place(paddle.CPUPlace())

x1 = paddle.ones([1, 2, 3])
with self.assertWarns(UserWarning, msg="ignored") as cm:
x2 = tensor_copy_to_cuda_with_warning(
x1, device_id=1, blocking=True
)
self.assertIn('math_op_patch.py', cm.filename)

with self.assertWarns(UserWarning, msg="ignored") as cm:
x2 = tensor_copy_to_cuda_with_warning(
x1, device_id=None, blocking=False
)
self.assertIn('math_op_patch.py', cm.filename)

with self.assertWarns(UserWarning, msg="ignored") as cm:
x2 = tensor_copy_to_cuda_with_warning(
x1, device_id=2, blocking=False
)
self.assertIn('math_op_patch.py', cm.filename)


if __name__ == '__main__':
unittest.main()
76 changes: 76 additions & 0 deletions test/dygraph_to_static/test_tensor_memcpy_on_gpu.py
@@ -27,6 +27,20 @@ def tensor_copy_to_cpu(x):
return y


@paddle.jit.to_static
def tensor_copy_to_cuda(x):
x = paddle.to_tensor(x)
y = x.cuda()
return y


@paddle.jit.to_static
def tensor_copy_to_cuda_with_warning(x, device_id=None, blocking=True):
x = paddle.to_tensor(x)
y = x.cuda(device_id, blocking)
return y


class TestTensorCopyToCpuOnDefaultGPU(unittest.TestCase):
def _run(self, to_static):
paddle.jit.enable_to_static(to_static)
@@ -53,5 +67,67 @@ def test_tensor_cpu_on_default_gpu(self):
self.assertTrue(static_place.is_cpu_place())


class TestTensorCopyToCUDAOnDefaultGPU(unittest.TestCase):
def _run(self, to_static):
paddle.jit.enable_to_static(to_static)
x1 = paddle.ones([1, 2, 3])
x2 = tensor_copy_to_cuda(x1)
return x1.place, x2.place, x2.numpy()

def test_tensor_cuda_on_default_gpu(self):
if paddle.fluid.is_compiled_with_cuda():
place = paddle.CUDAPlace(
int(os.environ.get('FLAGS_selected_gpus', 0))
)
else:
return
paddle.fluid.framework._set_expected_place(place)
dygraph_x1_place, dygraph_place, dygraph_res = self._run(
to_static=False
)
static_x1_place, static_place, static_res = self._run(to_static=True)
np.testing.assert_allclose(dygraph_res, static_res, rtol=1e-05)
self.assertTrue(dygraph_x1_place.is_gpu_place())
self.assertTrue(static_x1_place.is_gpu_place())
self.assertTrue(dygraph_place.is_gpu_place())
self.assertTrue(static_place.is_gpu_place())


class TestTensorCopyToCUDAWithWarningOnGPU(unittest.TestCase):
def _run(self, to_static):
paddle.jit.enable_to_static(to_static)
x1 = paddle.ones([1, 2, 3])
x2 = tensor_copy_to_cuda_with_warning(x1, device_id=1, blocking=False)
return x1.place, x2.place, x2.numpy()

def test_with_warning_on_gpu(self):
if paddle.fluid.is_compiled_with_cuda():
place = paddle.CUDAPlace(
int(os.environ.get('FLAGS_selected_gpus', 0))
)
else:
return
paddle.fluid.framework._set_expected_place(place)

x1 = paddle.ones([1, 2, 3])
with self.assertWarns(UserWarning, msg="ignored") as cm:
x2 = tensor_copy_to_cuda_with_warning(
x1, device_id=1, blocking=True
)
self.assertIn('math_op_patch.py', cm.filename)

with self.assertWarns(UserWarning, msg="ignored") as cm:
x2 = tensor_copy_to_cuda_with_warning(
x1, device_id=None, blocking=False
)
self.assertIn('math_op_patch.py', cm.filename)

with self.assertWarns(UserWarning, msg="ignored") as cm:
x2 = tensor_copy_to_cuda_with_warning(
x1, device_id=2, blocking=False
)
self.assertIn('math_op_patch.py', cm.filename)


if __name__ == '__main__':
unittest.main()