Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Base modifications to make BP5 GPU aware #3000

Merged
merged 6 commits into from
Jan 12, 2022
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 13 additions & 17 deletions examples/cuda/cudaWriteRead.cu
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ int BPWrite(const std::string fname, const size_t N, int nSteps){
// Set up the ADIOS structures
adios2::ADIOS adios;
adios2::IO io = adios.DeclareIO("WriteIO");
io.SetEngine("BP5");

// Declare an array for the ADIOS data of size (NumOfProcesses * N)
const adios2::Dims shape{static_cast<size_t>(N)};
Expand Down Expand Up @@ -61,28 +62,23 @@ int BPRead(const std::string fname, const size_t N, int nSteps){
// Create ADIOS structures
adios2::ADIOS adios;
adios2::IO io = adios.DeclareIO("ReadIO");
io.SetEngine("BP5");

adios2::Engine bpReader = io.Open(fname, adios2::Mode::Read);

auto data = io.InquireVariable<float>("data");
std::cout << "Steps expected by the reader: " << bpReader.Steps() << std::endl;
std::cout << "Expecting data per step: " << data.Shape()[0];
std::cout << " elements" << std::endl;

int write_step = bpReader.Steps();
// Create the local buffer and initialize the access point in the ADIOS file
std::vector<float> simData(N); //set size to N
const adios2::Dims start{0};
const adios2::Dims count{N};
const adios2::Box<adios2::Dims> sel(start, count);
data.SetSelection(sel);

// Read the data in each of the ADIOS steps
for (size_t step = 0; step < write_step; step++)
unsigned int step = 0;
for (; bpReader.BeginStep() == adios2::StepStatus::OK; ++step)
{
data.SetStepSelection({step, 1});
auto data = io.InquireVariable<float>("data");
// Create the local buffer and initialize the access point in the ADIOS file
std::vector<float> simData(N); //set size to N
const adios2::Dims start{0};
const adios2::Dims count{N};
const adios2::Box<adios2::Dims> sel(start, count);
data.SetSelection(sel);

bpReader.Get(data, simData.data());
bpReader.PerformGets();
bpReader.EndStep();
std::cout << "Simualation step " << step << " : ";
std::cout << simData.size() << " elements: " << simData[1] << std::endl;
}
Expand Down
4 changes: 4 additions & 0 deletions source/adios2/engine/bp5/BP5Writer.tcc
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@ void BP5Writer::PutCommon(Variable<T> &variable, const T *values, bool sync)
BeginStep(StepMode::Update);
}
variable.SetData(values);
// if the user buffer is allocated on the GPU always use sync mode
bool isCudaBuffer = (variable.m_MemorySpace == MemorySpace::CUDA);
if (isCudaBuffer)
sync = true;

size_t *Shape = NULL;
size_t *Start = NULL;
Expand Down
3 changes: 3 additions & 0 deletions source/adios2/helper/adiosMemory.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@ void InsertToBuffer(std::vector<char> &buffer, const T *source,
template <class T>
void CopyFromGPUToBuffer(std::vector<char> &buffer, size_t &position,
const T *source, const size_t elements = 1) noexcept;
template <class T>
void CudaMemCopyToBuffer(char *buffer, size_t position,
const T *source, const size_t size) noexcept;

/**
* Wrapper around cudaMemcpy needed for isolating CUDA interface dependency
Expand Down
11 changes: 9 additions & 2 deletions source/adios2/helper/adiosMemory.inl
Original file line number Diff line number Diff line change
Expand Up @@ -79,10 +79,17 @@ template <class T>
void CopyFromGPUToBuffer(std::vector<char> &buffer, size_t &position,
const T *source, const size_t elements) noexcept
{
const char *src = reinterpret_cast<const char *>(source);
MemcpyGPUToBuffer(buffer.data() + position, src, elements * sizeof(T));
CudaMemCopyToBuffer(buffer.data(), position, source, elements * sizeof(T));
position += elements * sizeof(T);
}

template <class T>
void CudaMemCopyToBuffer(char *buffer, size_t position,
const T *source, const size_t size) noexcept
{
const char *src = reinterpret_cast<const char *>(source);
MemcpyGPUToBuffer(buffer + position, src, size);
}
#endif

template <class T>
Expand Down
20 changes: 16 additions & 4 deletions source/adios2/toolkit/format/bp5/BP5Serializer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -570,15 +570,26 @@ void BP5Serializer::DumpDeferredBlocks(bool forceCopyDeferred)
}

static void GetMinMax(const void *Data, size_t ElemCount, const DataType Type,
core::Engine::MinMaxStruct &MinMax)
core::Engine::MinMaxStruct &MinMax, MemorySpace MemSpace)
{

MinMax.Init(Type);
if (ElemCount == 0)
return;
if (Type == DataType::Compound)
{
}
#ifdef ADIOS2_HAVE_CUDA
#define pertype(T, N) \
else if (MemSpace == MemorySpace::CUDA && Type == helper::GetDataType<T>())\
{ \
const size_t size = ElemCount * sizeof(T); \
const T *values = (const T *)Data; \
helper::CUDAMinMax(values, ElemCount, MinMax.MinUnion.field_##N, \
MinMax.MaxUnion.field_##N); \
}
ADIOS2_FOREACH_MINMAX_STDTYPE_2ARGS(pertype)
#undef pertype
#endif
#define pertype(T, N) \
else if (Type == helper::GetDataType<T>()) \
{ \
Expand Down Expand Up @@ -669,7 +680,8 @@ void BP5Serializer::Marshal(void *Variable, const char *Name,
MinMax.Init(Type);
if ((m_StatsLevel > 0) && !Span)
{
GetMinMax(Data, ElemCount, (DataType)Rec->Type, MinMax);
GetMinMax(Data, ElemCount, (DataType)Rec->Type, MinMax,
VB->m_MemorySpace);
}

if (Rec->OperatorType)
Expand Down Expand Up @@ -700,7 +712,7 @@ void BP5Serializer::Marshal(void *Variable, const char *Name,
{
DataOffset = m_PriorDataBufferSizeTotal +
CurDataBuffer->AddToVec(ElemCount * ElemSize, Data,
ElemSize, Sync);
ElemSize, Sync, VB->m_MemorySpace);
}
}
else
Expand Down
2 changes: 1 addition & 1 deletion source/adios2/toolkit/format/buffer/BufferV.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ class BufferV
virtual void Reset();

virtual size_t AddToVec(const size_t size, const void *buf, size_t align,
bool CopyReqd) = 0;
bool CopyReqd, MemorySpace MemSpace=MemorySpace::Host) = 0;

struct BufferPos
{
Expand Down
19 changes: 16 additions & 3 deletions source/adios2/toolkit/format/buffer/chunk/ChunkV.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

#include "ChunkV.h"
#include "adios2/toolkit/format/buffer/BufferV.h"
#include "adios2/helper/adiosFunctions.h"

#include <algorithm>
#include <assert.h>
Expand Down Expand Up @@ -82,7 +83,7 @@ void ChunkV::CopyExternalToInternal()
}

size_t ChunkV::AddToVec(const size_t size, const void *buf, size_t align,
bool CopyReqd)
bool CopyReqd, MemorySpace MemSpace)
{
if (size == 0)
{
Expand Down Expand Up @@ -120,7 +121,7 @@ size_t ChunkV::AddToVec(const size_t size, const void *buf, size_t align,
if (AppendPossible)
{
// We can use current chunk, just append the data;
memcpy(m_TailChunk + m_TailChunkPos, buf, size);
CopyDataToBuffer(size, buf, m_TailChunkPos, MemSpace);
DataV.back().Size += size;
m_TailChunkPos += size;
}
Expand All @@ -132,7 +133,7 @@ size_t ChunkV::AddToVec(const size_t size, const void *buf, size_t align,
NewSize = size;
m_TailChunk = (char *)malloc(NewSize);
m_Chunks.push_back(m_TailChunk);
memcpy(m_TailChunk, buf, size);
CopyDataToBuffer(size, buf, 0, MemSpace);
m_TailChunkPos = size;
VecEntry entry = {false, m_TailChunk, 0, size};
DataV.push_back(entry);
Expand All @@ -142,6 +143,18 @@ size_t ChunkV::AddToVec(const size_t size, const void *buf, size_t align,
return retOffset;
}

void ChunkV::CopyDataToBuffer(const size_t size, const void *buf, size_t pos,
MemorySpace MemSpace){
#ifdef ADIOS2_HAVE_CUDA
if(MemSpace == MemorySpace::CUDA)
{
helper::CudaMemCopyToBuffer(m_TailChunk, pos, buf, size);
return;
}
#endif
memcpy(m_TailChunk + pos, buf, size);
}

BufferV::BufferPos ChunkV::Allocate(const size_t size, size_t align)
{
if (size == 0)
Expand Down
4 changes: 3 additions & 1 deletion source/adios2/toolkit/format/buffer/chunk/ChunkV.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,14 +32,16 @@ class ChunkV : public BufferV
virtual std::vector<core::iovec> DataVec() noexcept;

virtual size_t AddToVec(const size_t size, const void *buf, size_t align,
bool CopyReqd);
bool CopyReqd, MemorySpace MemSpace=MemorySpace::Host);

virtual BufferPos Allocate(const size_t size, size_t align);
virtual void DownsizeLastAlloc(const size_t oldSize, const size_t newSize);

virtual void *GetPtr(int bufferIdx, size_t posInBuffer);

void CopyExternalToInternal();
void CopyDataToBuffer(const size_t size, const void *buf, size_t pos,
MemorySpace MemSpace);

private:
std::vector<char *> m_Chunks;
Expand Down
2 changes: 1 addition & 1 deletion source/adios2/toolkit/format/buffer/malloc/MallocV.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ void MallocV::CopyExternalToInternal()
}

size_t MallocV::AddToVec(const size_t size, const void *buf, size_t align,
bool CopyReqd)
bool CopyReqd, MemorySpace MemSpace)
{
if (size == 0)
{
Expand Down
2 changes: 1 addition & 1 deletion source/adios2/toolkit/format/buffer/malloc/MallocV.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ class MallocV : public BufferV
virtual void Reset();

virtual size_t AddToVec(const size_t size, const void *buf, size_t align,
bool CopyReqd);
bool CopyReqd, MemorySpace MemSpace=MemorySpace::Host);

virtual BufferPos Allocate(const size_t size, size_t align);
void DownsizeLastAlloc(const size_t oldSize, const size_t newSize);
Expand Down