[CustomDevice] add stream safe allocator support #55393

Merged 1 commit on Jul 14, 2023
3 changes: 2 additions & 1 deletion paddle/fluid/memory/allocation/CMakeLists.txt
@@ -51,7 +51,8 @@ if(UNIX AND NOT APPLE)
endif()

if(WITH_CUSTOM_DEVICE)
list(APPEND ALLOCATOR_SRCS custom_allocator.cc)
list(APPEND ALLOCATOR_SRCS custom_allocator.cc
stream_safe_custom_device_allocator.cc)
endif()

if(WITH_XPU)
127 changes: 123 additions & 4 deletions paddle/fluid/memory/allocation/allocator_facade.cc
@@ -61,9 +61,9 @@
#endif

#ifdef PADDLE_WITH_CUSTOM_DEVICE
#include "paddle/fluid/memory/allocation/custom_allocator.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/memory/allocation/stream_safe_custom_device_allocator.h"
#endif

#include "paddle/fluid/platform/flags.h"

PADDLE_DEFINE_EXPORTED_int64(
@@ -174,6 +174,11 @@ class AllocatorFacadePrivate {
std::map<platform::XPUPlace,
std::map<XPUStream, std::shared_ptr<Allocator>>>;
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
using CustomDeviceAllocatorMap =
std::map<platform::CustomPlace,
std::map<phi::stream::stream_t, std::shared_ptr<Allocator>>>;
#endif

explicit AllocatorFacadePrivate(bool allow_free_idle_chunk = true) {
strategy_ = GetAllocatorStrategy();
@@ -564,6 +569,46 @@ class AllocatorFacadePrivate {
}
#endif

#ifdef PADDLE_WITH_CUSTOM_DEVICE
bool HasCustomDevice(const platform::CustomPlace& place,
phi::stream::stream_t stream) {
auto it = custom_device_allocators_.find(place);
if (it == custom_device_allocators_.end()) {
return false;
}
auto& allocator_map = it->second;
return allocator_map.find(stream) != allocator_map.end();
}

const std::shared_ptr<Allocator>& GetAllocator(
const platform::CustomPlace& place,
phi::stream::stream_t stream,
bool create_if_not_found = false) {
/* shared_lock_guard */ {
std::shared_lock<std::shared_timed_mutex> lock_guard(
custom_device_allocator_mutex_);
if (LIKELY(HasCustomDevice(place, stream))) {
return custom_device_allocators_[place][stream];
} else {
PADDLE_ENFORCE_NE(create_if_not_found,
false,
platform::errors::NotFound(
"No allocator found for stream %s in place %s "
"with create_if_not_found = false",
stream,
place));
}
}

/* unique_lock_guard */ {
std::unique_lock<std::shared_timed_mutex> lock_guard(
custom_device_allocator_mutex_);
InitStreamSafeCustomDeviceAllocator(place, stream);
return custom_device_allocators_[place][stream];
}
}
#endif

private:
class ZeroSizeAllocator : public Allocator {
public:
Expand Down Expand Up @@ -1008,9 +1053,17 @@ class AllocatorFacadePrivate {
allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
}

void InitNaiveBestFitCustomDeviceAllocator(platform::CustomPlace p,
phi::stream::stream_t stream) {
custom_device_allocators_[p][stream] =
std::make_shared<NaiveBestFitAllocator>(p);
}

void InitAutoGrowthCustomDeviceAllocator(platform::CustomPlace p,
bool allow_free_idle_chunk) {
auto chunk_size = FLAGS_auto_growth_chunk_size_in_mb << 20;
VLOG(4) << "FLAGS_auto_growth_chunk_size_in_mb is "
<< FLAGS_auto_growth_chunk_size_in_mb;
auto custom_allocator =
std::make_shared<paddle::memory::allocation::CustomAllocator>(p);
allocators_[p] = std::make_shared<AutoGrowthBestFitAllocator>(
@@ -1019,6 +1072,40 @@
/*chunk_size=*/chunk_size,
allow_free_idle_chunk);
}

void InitAutoGrowthCustomDeviceAllocator(platform::CustomPlace p,
phi::stream::stream_t stream) {
auto chunk_size = FLAGS_auto_growth_chunk_size_in_mb << 20;
VLOG(4) << "FLAGS_auto_growth_chunk_size_in_mb is "
<< FLAGS_auto_growth_chunk_size_in_mb;

auto custom_allocator =
std::make_shared<paddle::memory::allocation::CustomAllocator>(p);
auto alignment = phi::DeviceManager::GetMinChunkSize(p);
custom_device_allocators_[p][stream] =
std::make_shared<AutoGrowthBestFitAllocator>(
custom_allocator, alignment, chunk_size, allow_free_idle_chunk_);
}

void WrapStreamSafeCustomDeviceAllocator(platform::CustomPlace p,
phi::stream::stream_t stream) {
std::shared_ptr<Allocator>& allocator =
custom_device_allocators_[p][stream];
allocator =
std::make_shared<StreamSafeCustomDeviceAllocator>(allocator, p, stream);
}

void InitStreamSafeCustomDeviceAllocator(platform::CustomPlace p,
phi::stream::stream_t stream) {
VLOG(8) << "Init CustomDevice allocator for stream " << stream
<< " in place " << p;
if (strategy_ == AllocatorStrategy::kAutoGrowth) {
InitAutoGrowthCustomDeviceAllocator(p, stream);
} else {
InitNaiveBestFitCustomDeviceAllocator(p, stream);
}
WrapStreamSafeCustomDeviceAllocator(p, stream);
}
#endif

void InitSystemAllocators() {
@@ -1161,6 +1248,15 @@ class AllocatorFacadePrivate {
std::shared_timed_mutex xpu_allocator_mutex_;
#endif

#ifdef PADDLE_WITH_CUSTOM_DEVICE
// a standalone custom device allocator to support multi-stream GC in new
// executor
std::map<platform::Place, std::shared_ptr<StreamSafeCustomDeviceAllocator>>
default_stream_safe_custom_device_allocators_;
CustomDeviceAllocatorMap custom_device_allocators_;
std::shared_timed_mutex custom_device_allocator_mutex_;
#endif

AllocatorStrategy strategy_;
AllocatorMap allocators_;
static AllocatorMap zero_size_allocators_;
@@ -1252,6 +1348,16 @@ std::shared_ptr<phi::Allocation> AllocatorFacade::AllocShared(
AllocationPtr AllocatorFacade::Alloc(const platform::Place& place,
size_t size,
const phi::Stream& stream) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
if (platform::is_custom_place(place)) {
platform::CustomPlace p(place);
phi::stream::stream_t s =
reinterpret_cast<phi::stream::stream_t>(stream.id());
return GetPrivate()
->GetAllocator(p, s, /* create_if_not_found = */ true)
->Allocate(size);
}
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
AllocatorFacadePrivate* m = GetPrivate();
if (!m->IsStreamSafeCUDAAllocatorUsed()) {
@@ -1270,8 +1376,8 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place,
#elif defined(PADDLE_WITH_XPU)
return GetAllocator(place)->Allocate(size);
#else
PADDLE_THROW(
platform::errors::PreconditionNotMet("Not compiled with GPU or XPU."));
PADDLE_THROW(platform::errors::PreconditionNotMet(
"Not compiled with GPU or XPU or CustomDevice."));
#endif
}

@@ -1376,6 +1482,19 @@ void AllocatorFacade::RemoveMemoryPoolOfCUDAGraph(int64_t id) {
#endif
#endif

#ifdef PADDLE_WITH_CUSTOM_DEVICE
const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
const platform::Place& place, phi::stream::stream_t stream) {
AllocatorFacadePrivate* m = GetPrivate();
if (!FLAGS_use_stream_safe_cuda_allocator) {
return m->GetAllocator(place,
stream,
/*create_if_not_found=*/true);
}
return m->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
}
#endif

UNUSED static std::shared_ptr<NaiveBestFitAllocator> unused_obj =
std::make_shared<NaiveBestFitAllocator>(platform::CPUPlace());

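Note: the per-stream lookup in AllocatorFacadePrivate::GetAllocator above takes a shared lock for the common case (the allocator for this place/stream pair already exists) and falls back to an exclusive lock only to create a missing entry. A minimal standalone sketch of that lookup-or-create pattern follows; Place, Stream and Allocator are placeholder types standing in for platform::CustomPlace, phi::stream::stream_t and the Paddle allocator, not the real API.

#include <map>
#include <memory>
#include <mutex>
#include <shared_mutex>

// Placeholder types for illustration only.
using Place = int;
using Stream = void*;
struct Allocator {};

class PerStreamAllocatorMap {
 public:
  // Fast path under a shared lock; create under an exclusive lock if missing.
  const std::shared_ptr<Allocator>& GetOrCreate(const Place& place,
                                                Stream stream) {
    {
      std::shared_lock<std::shared_timed_mutex> guard(mutex_);
      auto it = allocators_.find(place);
      if (it != allocators_.end()) {
        auto sit = it->second.find(stream);
        if (sit != it->second.end()) return sit->second;
      }
    }
    std::unique_lock<std::shared_timed_mutex> guard(mutex_);
    auto& slot = allocators_[place][stream];
    if (!slot) slot = std::make_shared<Allocator>();  // create exactly once
    return slot;
  }

 private:
  std::map<Place, std::map<Stream, std::shared_ptr<Allocator>>> allocators_;
  std::shared_timed_mutex mutex_;
};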
9 changes: 9 additions & 0 deletions paddle/fluid/memory/allocation/allocator_facade.h
@@ -25,6 +25,11 @@
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/core/stream.h"

#ifdef PADDLE_WITH_CUSTOM_DEVICE
#include "paddle/fluid/memory/allocation/custom_allocator.h"
#include "paddle/phi/backends/device_manager.h"
#endif

namespace paddle {
namespace memory {
namespace allocation {
@@ -91,6 +96,10 @@ class AllocatorFacade {
void RemoveMemoryPoolOfCUDAGraph(int64_t id);
#endif

#ifdef PADDLE_WITH_CUSTOM_DEVICE
const std::shared_ptr<Allocator>& GetAllocator(const platform::Place& place,
phi::stream::stream_t stream);
#endif
// TODO(yy): Allocate a Copy-On-Write allocation?
private:
AllocatorFacade();
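Note: WrapStreamSafeCustomDeviceAllocator in allocator_facade.cc wraps each per-stream allocator in a StreamSafeCustomDeviceAllocator, whose role, by analogy with the existing stream-safe CUDA allocator, is to delay returning memory to the underlying allocator until the stream that used it has drained. Below is a rough sketch of that deferred-free idea with placeholder types; it is not Paddle's actual StreamSafeCustomDeviceAllocator implementation.

#include <cstddef>
#include <functional>
#include <utility>
#include <vector>

using Stream = void*;

struct Block {
  void* ptr;
  std::size_t size;
  Stream stream;  // the stream this block was handed out on
};

class DeferredFreeAllocator {
 public:
  explicit DeferredFreeAllocator(std::function<void(void*)> underlying_free)
      : underlying_free_(std::move(underlying_free)) {}

  // Do not free immediately: the device may still be using the memory on
  // block.stream. Park the block until that stream is known to be idle.
  void Release(Block block) { pending_.push_back(block); }

  // Called once `stream` has finished its queued work (e.g. an event recorded
  // on it has completed): hand everything tied to it back to the real
  // allocator.
  void OnStreamIdle(Stream stream) {
    std::vector<Block> still_pending;
    for (const Block& b : pending_) {
      if (b.stream == stream) {
        underlying_free_(b.ptr);
      } else {
        still_pending.push_back(b);
      }
    }
    pending_.swap(still_pending);
  }

 private:
  std::function<void(void*)> underlying_free_;
  std::vector<Block> pending_;
};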