Commit

part-4 cherry from: Add auto growth allocator for CUDA pinned allocator (PaddlePaddle#57625)

* fix h2d bandwidth

* remove useless flags
sneaxiy authored and wentaoyu committed Nov 28, 2023
1 parent c5c41cc commit 35d8e6f
Showing 5 changed files with 65 additions and 4 deletions.
18 changes: 16 additions & 2 deletions paddle/fluid/memory/allocation/allocator_facade.cc
@@ -97,6 +97,7 @@ PADDLE_DEFINE_EXPORTED_bool(use_cuda_managed_memory,
 
 PHI_DECLARE_string(allocator_strategy);
 PHI_DECLARE_uint64(auto_growth_chunk_size_in_mb);
+PHI_DECLARE_bool(use_auto_growth_pinned_allocator);
 
 namespace paddle {
 namespace memory {
@@ -720,8 +721,21 @@
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   void InitNaiveBestFitCUDAPinnedAllocator() {
-    allocators_[platform::CUDAPinnedPlace()] =
-        std::make_shared<NaiveBestFitAllocator>(platform::CUDAPinnedPlace());
+    if (FLAGS_use_auto_growth_pinned_allocator) {
+      auto chunk_size = FLAGS_auto_growth_chunk_size_in_mb << 20;
+      VLOG(4) << "FLAGS_auto_growth_chunk_size_in_mb is "
+              << FLAGS_auto_growth_chunk_size_in_mb;
+      auto pinned_allocator = std::make_shared<CPUPinnedAllocator>();
+      allocators_[platform::CUDAPinnedPlace()] =
+          std::make_shared<AutoGrowthBestFitAllocator>(
+              pinned_allocator,
+              phi::backends::cpu::CUDAPinnedMinChunkSize(),
+              chunk_size,
+              allow_free_idle_chunk_);
+    } else {
+      allocators_[platform::CUDAPinnedPlace()] =
+          std::make_shared<NaiveBestFitAllocator>(platform::CUDAPinnedPlace());
+    }
   }
 
   void InitNaiveBestFitCUDAAllocator(platform::CUDAPlace p) {
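The effect of this change: with the flag on, CUDAPinnedPlace requests go through an AutoGrowthBestFitAllocator layered over CPUPinnedAllocator, so the expensive cudaHostAlloc driver call is made once per chunk of at least chunk_size bytes (the << 20 shift converts the MB flag to bytes, e.g. 64 << 20 == 67108864) rather than once per tensor; that is the "fix h2d bandwidth" item in the commit message. Below is a minimal Python sketch of the auto-growth best-fit idea. It is illustrative only, not Paddle's implementation, and every name in it is hypothetical.

class AutoGrowthBestFitSketch:
    """Serve requests from cached free blocks; grow by whole chunks."""

    def __init__(self, raw_alloc, align, chunk_size):
        self.raw_alloc = raw_alloc    # expensive backend (cudaHostAlloc in the C++ above)
        self.align = align            # rounding granularity, like CUDAPinnedMinChunkSize()
        self.chunk_size = chunk_size  # minimum bytes requested from the backend
        self.free_blocks = []         # cached (size, buffer) pairs

    def alloc(self, size):
        size = -(-size // self.align) * self.align  # round up to the alignment
        fits = [b for b in self.free_blocks if b[0] >= size]
        if fits:
            best = min(fits, key=lambda b: b[0])    # best fit: smallest cached block that fits
            self.free_blocks.remove(best)
            return best
        n = max(size, self.chunk_size)              # auto growth: one backend call per chunk
        return (n, self.raw_alloc(n))

    def free(self, block):
        self.free_blocks.append(block)              # cache for reuse; no backend call


pinned = AutoGrowthBestFitSketch(bytearray, 4096, 64 << 20)  # bytearray stands in for cudaHostAlloc
b = pinned.alloc(16 << 20)   # first call grows a 64 MiB chunk via the backend
pinned.free(b)
b2 = pinned.alloc(16 << 20)  # served from the cache, no backend call

A real implementation also splits oversized blocks, coalesces neighbors, and (per the allow_free_idle_chunk_ argument above) can return idle chunks to the driver; the sketch keeps only the part that explains the bandwidth win: repeated pinned allocations stop hitting the driver.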
2 changes: 2 additions & 0 deletions paddle/fluid/memory/allocation/pinned_allocator.cc
@@ -26,6 +26,7 @@ void CPUPinnedAllocator::FreeImpl(phi::Allocation *allocation) {
 #else
   PADDLE_ENFORCE_GPU_SUCCESS(cudaFreeHost(allocation->ptr()));
 #endif
+  VLOG(10) << "cudaFreeHost " << allocation->ptr();
   HOST_MEMORY_STAT_UPDATE(Reserved, 0, -allocation->size());
   platform::RecordMemEvent(allocation->ptr(),
                            allocation->place(),
@@ -40,6 +41,7 @@ phi::Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) {
 #else
   PADDLE_ENFORCE_GPU_SUCCESS(cudaHostAlloc(&ptr, size, cudaHostAllocPortable));
 #endif
+  VLOG(10) << "cudaHostAlloc " << size << " " << ptr;
   HOST_MEMORY_STAT_UPDATE(Reserved, 0, size);
   platform::RecordMemEvent(ptr,
                            platform::CUDAPinnedPlace(),
5 changes: 5 additions & 0 deletions paddle/phi/core/flags.cc
@@ -1391,3 +1391,8 @@ PHI_DEFINE_EXPORTED_bool(enable_async_trace,
                          "enable collective async trace");
 
 PHI_DEFINE_EXPORTED_int32(async_trace_count, 5, "collective async trace count");
+
+PHI_DEFINE_EXPORTED_bool(
+    use_auto_growth_pinned_allocator,
+    false,
+    "Whether to use the auto_growth CUDA pinned allocator.");
4 changes: 2 additions & 2 deletions python/paddle/base/dygraph/tensor_patch_methods.py
@@ -967,11 +967,11 @@ def cuda(self, device_id=None, blocking=True):
             return res
 
     @framework.dygraph_only
-    def pin_memory(self):
+    def pin_memory(self, blocking=True):
         if self.place.is_cuda_pinned_place():
             return self
         else:
-            res = self._copy_to(core.CUDAPinnedPlace(), True)
+            res = self._copy_to(core.CUDAPinnedPlace(), blocking)
             res.stop_gradient = self.stop_gradient
             res.persistable = self.persistable
             return res
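pin_memory() previously always issued a blocking copy; forwarding blocking to _copy_to lets callers overlap the device-to-pinned-host transfer with other work, at the cost of an explicit synchronize before the data is read. A short usage sketch, following the pattern of the new test below (paddle.rand and the CUDA device are assumptions of the example, not part of the diff):

import paddle

x_gpu = paddle.rand([1024, 1024])          # lives on the default CUDA device
pinned = x_gpu.pin_memory(blocking=False)  # enqueue the copy, return immediately
paddle.device.cuda.synchronize()           # wait before reading; unneeded with blocking=True
host = pinned.numpy()                      # safe: the copy has completed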
40 changes: 40 additions & 0 deletions test/legacy_test/test_auto_growth_pinned_allocator.py
@@ -0,0 +1,40 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+
+import paddle
+
+
+class TestPinnedAllocator(unittest.TestCase):
+    def test_main(self):
+        if not paddle.is_compiled_with_cuda():
+            return
+
+        paddle.set_flags({'FLAGS_use_auto_growth_pinned_allocator': True})
+        x_np = np.random.random([1024, 1024, 4]).astype(np.float32)
+        x_pd_gpu = paddle.to_tensor(x_np)
+
+        x_pd_pin = x_pd_gpu.pin_memory(False)
+        paddle.device.cuda.synchronize()
+        np.testing.assert_equal(x_np, x_pd_pin.numpy())
+
+        x_pd_pin = x_pd_gpu.pin_memory()
+        np.testing.assert_equal(x_np, x_pd_pin.numpy())
+
+
+if __name__ == "__main__":
+    unittest.main()
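For scale, each pin_memory() call in the test above moves 1024 * 1024 * 4 float32 elements, i.e. 1024 * 1024 * 4 * 4 bytes = 16 MiB, into pinned host memory; note that only the non-blocking variant needs the explicit paddle.device.cuda.synchronize() before assert_equal reads the result.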
