Commit

part-4 cherry from: Add auto growth allocator for CUDA pinned allocator (PaddlePaddle#57625)

* fix h2d bandwidth

* remove useless flags
sneaxiy authored and wentaoyu committed Nov 28, 2023
1 parent c5c41cc commit 35d8e6f
Showing 5 changed files with 65 additions and 4 deletions.
18 changes: 16 additions & 2 deletions paddle/fluid/memory/allocation/allocator_facade.cc
@@ -97,6 +97,7 @@ PADDLE_DEFINE_EXPORTED_bool(use_cuda_managed_memory,
 
 PHI_DECLARE_string(allocator_strategy);
 PHI_DECLARE_uint64(auto_growth_chunk_size_in_mb);
+PHI_DECLARE_bool(use_auto_growth_pinned_allocator);
 
 namespace paddle {
 namespace memory {
@@ -720,8 +721,21 @@
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   void InitNaiveBestFitCUDAPinnedAllocator() {
-    allocators_[platform::CUDAPinnedPlace()] =
-        std::make_shared<NaiveBestFitAllocator>(platform::CUDAPinnedPlace());
+    if (FLAGS_use_auto_growth_pinned_allocator) {
+      auto chunk_size = FLAGS_auto_growth_chunk_size_in_mb << 20;
+      VLOG(4) << "FLAGS_auto_growth_chunk_size_in_mb is "
+              << FLAGS_auto_growth_chunk_size_in_mb;
+      auto pinned_allocator = std::make_shared<CPUPinnedAllocator>();
+      allocators_[platform::CUDAPinnedPlace()] =
+          std::make_shared<AutoGrowthBestFitAllocator>(
+              pinned_allocator,
+              phi::backends::cpu::CUDAPinnedMinChunkSize(),
+              chunk_size,
+              allow_free_idle_chunk_);
+    } else {
+      allocators_[platform::CUDAPinnedPlace()] =
+          std::make_shared<NaiveBestFitAllocator>(platform::CUDAPinnedPlace());
+    }
   }
 
   void InitNaiveBestFitCUDAAllocator(platform::CUDAPlace p) {
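The effect of this change: with the flag on, CUDAPinnedPlace requests go through an AutoGrowthBestFitAllocator layered over CPUPinnedAllocator, so the expensive cudaHostAlloc driver call is made once per chunk of at least chunk_size bytes (the << 20 shift converts the MB flag to bytes, e.g. 64 << 20 == 67108864) rather than once per tensor; that is the "fix h2d bandwidth" item in the commit message. Below is a minimal Python sketch of the auto-growth best-fit idea. It is illustrative only, not Paddle's implementation, and every name in it is hypothetical.

class AutoGrowthBestFitSketch:
    """Serve requests from cached free blocks; grow by whole chunks."""

    def __init__(self, raw_alloc, align, chunk_size):
        self.raw_alloc = raw_alloc    # expensive backend (cudaHostAlloc in the C++ above)
        self.align = align            # rounding granularity, like CUDAPinnedMinChunkSize()
        self.chunk_size = chunk_size  # minimum bytes requested from the backend
        self.free_blocks = []         # cached (size, buffer) pairs

    def alloc(self, size):
        size = -(-size // self.align) * self.align  # round up to the alignment
        fits = [b for b in self.free_blocks if b[0] >= size]
        if fits:
            best = min(fits, key=lambda b: b[0])    # best fit: smallest cached block that fits
            self.free_blocks.remove(best)
            return best
        n = max(size, self.chunk_size)              # auto growth: one backend call per chunk
        return (n, self.raw_alloc(n))

    def free(self, block):
        self.free_blocks.append(block)              # cache for reuse; no backend call


pinned = AutoGrowthBestFitSketch(bytearray, 4096, 64 << 20)  # bytearray stands in for cudaHostAlloc
b = pinned.alloc(16 << 20)   # first call grows a 64 MiB chunk via the backend
pinned.free(b)
b2 = pinned.alloc(16 << 20)  # served from the cache, no backend call

A real implementation also splits oversized blocks, coalesces neighbors, and (per the allow_free_idle_chunk_ argument above) can return idle chunks to the driver; the sketch keeps only the part that explains the bandwidth win: repeated pinned allocations stop hitting the driver.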
2 changes: 2 additions & 0 deletions paddle/fluid/memory/allocation/pinned_allocator.cc
@@ -26,6 +26,7 @@ void CPUPinnedAllocator::FreeImpl(phi::Allocation *allocation) {
 #else
   PADDLE_ENFORCE_GPU_SUCCESS(cudaFreeHost(allocation->ptr()));
 #endif
+  VLOG(10) << "cudaFreeHost " << allocation->ptr();
   HOST_MEMORY_STAT_UPDATE(Reserved, 0, -allocation->size());
   platform::RecordMemEvent(allocation->ptr(),
                            allocation->place(),
@@ -40,6 +41,7 @@ phi::Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) {
 #else
   PADDLE_ENFORCE_GPU_SUCCESS(cudaHostAlloc(&ptr, size, cudaHostAllocPortable));
 #endif
+  VLOG(10) << "cudaHostAlloc " << size << " " << ptr;
   HOST_MEMORY_STAT_UPDATE(Reserved, 0, size);
   platform::RecordMemEvent(ptr,
                            platform::CUDAPinnedPlace(),
5 changes: 5 additions & 0 deletions paddle/phi/core/flags.cc
@@ -1391,3 +1391,8 @@ PHI_DEFINE_EXPORTED_bool(enable_async_trace,
                          "enable collective async trace");
 
 PHI_DEFINE_EXPORTED_int32(async_trace_count, 5, "collective async trace count");
+
+PHI_DEFINE_EXPORTED_bool(
+    use_auto_growth_pinned_allocator,
+    false,
+    "Whether to use the auto_growth CUDA pinned allocator.");
4 changes: 2 additions & 2 deletions python/paddle/base/dygraph/tensor_patch_methods.py
@@ -967,11 +967,11 @@ def cuda(self, device_id=None, blocking=True):
             return res
 
     @framework.dygraph_only
-    def pin_memory(self):
+    def pin_memory(self, blocking=True):
         if self.place.is_cuda_pinned_place():
             return self
         else:
-            res = self._copy_to(core.CUDAPinnedPlace(), True)
+            res = self._copy_to(core.CUDAPinnedPlace(), blocking)
             res.stop_gradient = self.stop_gradient
             res.persistable = self.persistable
             return res
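pin_memory() previously always issued a blocking copy; forwarding blocking to _copy_to lets callers overlap the device-to-pinned-host transfer with other work, at the cost of an explicit synchronize before the data is read. A short usage sketch, following the pattern of the new test below (paddle.rand and the CUDA device are assumptions of the example, not part of the diff):

import paddle

x_gpu = paddle.rand([1024, 1024])          # lives on the default CUDA device
pinned = x_gpu.pin_memory(blocking=False)  # enqueue the copy, return immediately
paddle.device.cuda.synchronize()           # wait before reading; unneeded with blocking=True
host = pinned.numpy()                      # safe: the copy has completed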
40 changes: 40 additions & 0 deletions test/legacy_test/test_auto_growth_pinned_allocator.py
@@ -0,0 +1,40 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+
+import paddle
+
+
+class TestPinnedAllocator(unittest.TestCase):
+    def test_main(self):
+        if not paddle.is_compiled_with_cuda():
+            return
+
+        paddle.set_flags({'FLAGS_use_auto_growth_pinned_allocator': True})
+        x_np = np.random.random([1024, 1024, 4]).astype(np.float32)
+        x_pd_gpu = paddle.to_tensor(x_np)
+
+        x_pd_pin = x_pd_gpu.pin_memory(False)
+        paddle.device.cuda.synchronize()
+        np.testing.assert_equal(x_np, x_pd_pin.numpy())
+
+        x_pd_pin = x_pd_gpu.pin_memory()
+        np.testing.assert_equal(x_np, x_pd_pin.numpy())
+
+
+if __name__ == "__main__":
+    unittest.main()
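For scale, each pin_memory() call in the test above moves 1024 * 1024 * 4 float32 elements, i.e. 1024 * 1024 * 4 * 4 bytes = 16 MiB, into pinned host memory; note that only the non-blocking variant needs the explicit paddle.device.cuda.synchronize() before assert_equal reads the result.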
