Optimize sequence type usage on CUDA [3/n] (#9002)
hariharans29 authored Sep 8, 2021
1 parent 2e8792c commit c20cb76
Showing 6 changed files with 48 additions and 47 deletions.
4 changes: 4 additions & 0 deletions onnxruntime/core/framework/TensorSeq.h
@@ -68,6 +68,10 @@ class TensorSeq {
tensors_.push_back(std::move(tensor));
}

+ void Reserve(size_t capacity) {
+ tensors_.reserve(capacity);
+ }

private:
// A sequence must be associated with only one data type and all tensors in the seq must be of that type
// One other alternative of storing the data type of a seq is to templatize the TensorSeq class.
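Note: the new TensorSeq::Reserve simply forwards to std::vector::reserve on the backing tensor storage, so a kernel that knows its output size can pre-size the sequence once instead of growing it tensor by tensor. A minimal usage sketch (not part of this commit; FillSequence and its parameters are hypothetical, only SetType/Reserve/Add come from the class above):

```cpp
#include <utility>
#include <vector>
// Assumes the usual onnxruntime headers for TensorSeq, Tensor and MLDataType.

// Hypothetical helper showing the Reserve-before-Add pattern the CUDA kernels
// below adopt: one up-front allocation, then move each tensor into the sequence.
void FillSequence(onnxruntime::TensorSeq* seq,        // non-null output sequence
                  onnxruntime::MLDataType elem_type,  // single element type of the sequence
                  std::vector<onnxruntime::Tensor>& tensors) {
  seq->SetType(elem_type);
  seq->Reserve(tensors.size());   // new API: reserve capacity once
  for (auto& t : tensors) {
    seq->Add(std::move(t));       // Add checks type consistency internally
  }
}
```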
20 changes: 6 additions & 14 deletions onnxruntime/core/providers/cpu/sequence/sequence_ops.cc
@@ -29,10 +29,8 @@ ONNX_CPU_OPERATOR_KERNEL(

Status SequenceLength::Compute(OpKernelContext* context) const {
const auto* X = context->Input<TensorSeq>(0);
- ORT_ENFORCE(X != nullptr, "Got nullptr for sequence input.");

auto* Y = context->Output(0, {});
- ORT_ENFORCE(Y != nullptr, "SequenceLength: Got nullptr for output tensor");
auto* Y_data = Y->template MutableData<int64_t>();
*Y_data = static_cast<int64_t>(X->Size());

@@ -83,10 +81,9 @@ bool ValidateSeqIdx(int64_t input_seq_idx, int64_t seq_size) {

Status SequenceAt::Compute(OpKernelContext* context) const {
const auto* X = context->Input<TensorSeq>(0);
- ORT_ENFORCE(X != nullptr, "Got nullptr for sequence input.");

const auto* I = context->Input<Tensor>(1);
- ORT_ENFORCE(I != nullptr, "Got nullptr input for index tensor");

int64_t input_seq_idx = GetSeqIdx(*I);
if (!ValidateSeqIdx(input_seq_idx, static_cast<int64_t>(X->Size()))) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
@@ -98,7 +95,7 @@ Status SequenceAt::Compute(OpKernelContext* context) const {
}
const Tensor& indexed_tensor = X->Get(input_seq_idx);
auto* Y = context->Output(0, indexed_tensor.Shape().GetDims());
- ORT_ENFORCE(Y != nullptr, "SequenceAt: Got nullptr for output tensor");

CopyCpuTensor(&indexed_tensor, Y);

return Status::OK();
@@ -120,7 +117,7 @@ SequenceEmpty::SequenceEmpty(const OpKernelInfo& info) : OpKernel(info) {

Status SequenceEmpty::Compute(OpKernelContext* context) const {
auto* Y = context->Output<TensorSeq>(0);
- ORT_ENFORCE(Y != nullptr, "SequenceEmpty: Got nullptr for output sequence");

MLDataType seq_dtype{};
switch (dtype_) {
case ONNX_NAMESPACE::TensorProto_DataType_FLOAT:
@@ -195,10 +192,7 @@ Status CreateCopyAndAppendCpuTensor(const Tensor& in_tensor, OpKernelContext* co

Status SequenceInsert::Compute(OpKernelContext* context) const {
const auto* S = context->Input<TensorSeq>(0);
- ORT_ENFORCE(S != nullptr, "Got nullptr for sequence input.");

const auto* X = context->Input<Tensor>(1);
- ORT_ENFORCE(X != nullptr, "Got nullptr for input tensor.");

// Data type of the input tensor MUST be same as that of the input sequence
if (!S->IsSameDataType(*X)) {
@@ -223,7 +217,7 @@ Status SequenceInsert::Compute(OpKernelContext* context) const {
}

auto* Y = context->Output<TensorSeq>(0);
- ORT_ENFORCE(Y != nullptr, "SequenceInsert: Got nullptr for output sequence");

std::vector<Tensor> tensors;
tensors.reserve(num_tensors_input_seq + 1);
for (int i = 0; i < num_tensors_input_seq; ++i) {
@@ -257,9 +251,8 @@ ONNX_CPU_OPERATOR_KERNEL(

Status SequenceErase::Compute(OpKernelContext* context) const {
const auto* S = context->Input<TensorSeq>(0);
- ORT_ENFORCE(S != nullptr, "Got nullptr for sequence input.");

const auto* I = context->Input<Tensor>(1);

int64_t num_tensors_input_seq = static_cast<int64_t>(S->Size());
int64_t input_seq_idx = num_tensors_input_seq - 1; // default is erase last one
if (I) { // position is optional
@@ -275,8 +268,8 @@ Status SequenceErase::Compute(OpKernelContext* context) const {
}

auto* Y = context->Output<TensorSeq>(0);
- ORT_ENFORCE(Y != nullptr, "SequenceErase: Got nullptr for output sequence");
Y->SetType(S->DataType());

std::vector<Tensor> tensors;
tensors.reserve(num_tensors_input_seq - 1);
for (int i = 0; i < num_tensors_input_seq; ++i) {
@@ -303,7 +296,6 @@ Status SequenceConstruct::Compute(OpKernelContext* context) const {
ORT_ENFORCE(num_inputs >= 1, "Must have 1 or more inputs");

auto* Y = context->Output<TensorSeq>(0);
- ORT_ENFORCE(Y != nullptr, "SequenceConstruct: Got nullptr for output sequence");

MLDataType first_dtype = context->Input<Tensor>(0)->DataType();
// Before copying check if all tensors are of the same type.
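Note: GetSeqIdx and ValidateSeqIdx (visible in the hunk headers above) implement the ONNX convention that a sequence position may be negative, in which case it counts back from the end. A self-contained restatement of that rule (my sketch, not the file's exact code; SequenceInsert additionally accepts position == n, i.e. append, as the CUDA hunk further below shows):

```cpp
#include <cstdint>

// For a sequence of n tensors, SequenceAt/SequenceErase accept positions in
// [-n, n-1]; negative values count from the end (-1 is the last element).
bool IsValidSeqIdx(int64_t idx, int64_t n) {
  return idx >= -n && idx < n;
}

// Map a possibly negative position to a plain index, e.g. n == 4, idx == -1 -> 3.
int64_t NormalizeSeqIdx(int64_t idx, int64_t n) {
  return idx < 0 ? idx + n : idx;
}
```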
66 changes: 34 additions & 32 deletions onnxruntime/core/providers/cuda/tensor/sequence_op.h
@@ -35,7 +35,6 @@ class SequenceAt final : public CudaKernel {
const void* source_addr = source_tensor.DataRaw(source_type);

Tensor* target_tensor = context->Output(0, source_tensor.Shape());
- ORT_ENFORCE(target_tensor != nullptr, "SequenceAt GPU: Got nullptr for output tensor.");
void* target_addr = target_tensor->MutableDataRaw(source_type);

if (source_addr != target_addr) {
@@ -52,27 +51,33 @@ class SequenceConstruct final : public CudaKernel {
public:
SequenceConstruct(const OpKernelInfo& info) : CudaKernel(info) {}
Status ComputeInternal(OpKernelContext* context) const override {
- TensorSeq* Y = context->Output<TensorSeq>(0);
+ auto num_inputs = Node().InputArgCount().front();
+ ORT_ENFORCE(num_inputs >= 1, "Must have 1 or more inputs");

+ MLDataType first_dtype = context->Input<Tensor>(0)->DataType();

AllocatorPtr alloc;
ORT_ENFORCE(context->GetTempSpaceAllocator(&alloc).IsOK(),
"SequenceConstruct GPU: Unable to get an allocator.");

- int32_t at = 0;
- const Tensor* source_tensor = nullptr;
- while (nullptr != (source_tensor = context->Input<Tensor>(at++))) {
- if (1 == at) {
- Y->SetType(source_tensor->DataType());
- }
+ TensorSeq* Y = context->Output<TensorSeq>(0);
+ Y->SetType(first_dtype);
+ Y->Reserve(num_inputs);

+ for (int input_idx = 0; input_idx < num_inputs; ++input_idx) {
+ const auto* source_tensor = context->Input<Tensor>(input_idx);

std::unique_ptr<Tensor> target_tensor = Tensor::Create(source_tensor->DataType(),
source_tensor->Shape(), alloc);
- ORT_ENFORCE(target_tensor, "SequenceConstruct GPU: Failed to allocate new tensor.");

CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(target_tensor->MutableDataRaw(),
source_tensor->DataRaw(),
source_tensor->SizeInBytes(),
cudaMemcpyDeviceToDevice, Stream()));
- Y->Add(std::move(*target_tensor)); // Add will check type consistency inside

+ Y->Add(std::move(*target_tensor)); // Add will check for type consistency
}

return Status::OK();
}
}; // SequenceConstruct
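Note: every CUDA kernel in this file moves tensor payloads with the same asynchronous device-to-device copy on the kernel's stream. A stripped-down, self-contained version of that copy (plain CUDA runtime API, no ORT types; in the kernels the stream is Stream() and the returned status is wrapped by CUDA_RETURN_IF_ERROR):

```cpp
#include <cuda_runtime.h>

// Asynchronous device-to-device copy ordered with other work on `stream`;
// no explicit synchronization is needed before returning as long as
// downstream consumers of the destination buffer use the same stream.
cudaError_t CopyDeviceBuffer(void* dst, const void* src, size_t bytes,
                             cudaStream_t stream) {
  return cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToDevice, stream);
}
```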
@@ -179,9 +184,11 @@ class SequenceErase final : public CudaKernel {
AllocatorPtr alloc;
ORT_ENFORCE(context->GetTempSpaceAllocator(&alloc).IsOK(),
"SequenceErase GPU: Unable to get an allocator.");

TensorSeq* Y = context->Output<TensorSeq>(0);
- ORT_ENFORCE(Y != nullptr, "SequenceErase GPU: Failed to allocate output tensor sequence.");
Y->SetType(X->DataType());
+ Y->Reserve(X_size - 1);

for (int64_t i = 0; i < X_size; ++i) {
if (i == idx) {
continue;
@@ -190,13 +197,13 @@
std::unique_ptr<Tensor> target_tensor = Tensor::Create(source_tensor.DataType(),
source_tensor.Shape(), alloc);

- ORT_ENFORCE(target_tensor, "SequenceErase GPU: Failed to allocate new tensor.");
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(target_tensor->MutableDataRaw(),
source_tensor.DataRaw(),
source_tensor.SizeInBytes(),
cudaMemcpyDeviceToDevice, Stream()));
- Y->Add(std::move(*target_tensor));
+ Y->Add(std::move(*target_tensor)); // Add will check for type consistency
}

return Status::OK();
}
}; // SequenceErase
@@ -223,44 +230,39 @@ class SequenceInsert final : public CudaKernel {
ORT_ENFORCE(idx >= 0 && idx <= S_size, "SequenceInsert GPU: Invalid sequence index.");
}
const Tensor* X = context->Input<Tensor>(1);
- ORT_ENFORCE(X != nullptr, "SequenceInsert GPU: Got nullptr for tensor input.");

AllocatorPtr alloc;
ORT_ENFORCE(context->GetTempSpaceAllocator(&alloc).IsOK(),
"SequenceInsert GPU: Unable to get an allocator.");

+ std::unique_ptr<Tensor> tensor_to_be_inserted = Tensor::Create(X->DataType(),
+ X->Shape(), alloc);
+ CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(tensor_to_be_inserted->MutableDataRaw(),
+ X->DataRaw(), X->SizeInBytes(),
+ cudaMemcpyDeviceToDevice, Stream()));

TensorSeq* Y = context->Output<TensorSeq>(0);
- ORT_ENFORCE(Y != nullptr, "SequenceInsert GPU: Failed to allocate output tensor sequence.");
Y->SetType(S->DataType());
+ Y->Reserve(S_size + 1);

for (int64_t i = 0; i < S_size; ++i) {
if (i == idx) {
- std::unique_ptr<Tensor> target_tensor = Tensor::Create(X->DataType(),
- X->Shape(), alloc);
- ORT_ENFORCE(target_tensor, "SequenceInsert GPU: Failed to allocate new tensor.");
- CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(target_tensor->MutableDataRaw(),
- X->DataRaw(), X->SizeInBytes(),
- cudaMemcpyDeviceToDevice, Stream()));
- Y->Add(std::move(*target_tensor));
+ Y->Add(std::move(*tensor_to_be_inserted)); // Add will check for type consistency
}
const Tensor& source_tensor = S->Get(i);
std::unique_ptr<Tensor> target_tensor = Tensor::Create(source_tensor.DataType(),
source_tensor.Shape(), alloc);
- ORT_ENFORCE(target_tensor, "SequenceInsert GPU: Failed to allocate new tensor.");
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(target_tensor->MutableDataRaw(),
source_tensor.DataRaw(),
source_tensor.SizeInBytes(),
cudaMemcpyDeviceToDevice, Stream()));
- Y->Add(std::move(*target_tensor)); // Add will check type consistency inside
- } // for
+ Y->Add(std::move(*target_tensor)); // Add will check for type consistency
+ }

if (idx == S_size) {
- std::unique_ptr<Tensor> target_tensor = Tensor::Create(X->DataType(),
- X->Shape(), alloc);
- ORT_ENFORCE(target_tensor, "SequenceInsert GPU: Failed to allocate new tensor.");
- CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(target_tensor->MutableDataRaw(),
- X->DataRaw(), X->SizeInBytes(),
- cudaMemcpyDeviceToDevice, Stream()));
- Y->Add(std::move(*target_tensor));
+ Y->Add(std::move(*tensor_to_be_inserted)); // Add will check for type consistency
}

return Status::OK();
}
}; // SequenceInsert
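Note: after this change SequenceConstruct, SequenceErase and SequenceInsert all share the same shape on CUDA: set the element type, Reserve the exact output size, then Create/copy/Add each tensor. A condensed restatement of that flow (not compilable on its own: OpKernelContext, AllocatorPtr, Tensor::Create, CUDA_RETURN_IF_ERROR and the stream all come from the surrounding CudaKernel code above; BuildOutputSequence itself is hypothetical):

```cpp
// Condensed sketch of the common pattern used by the three kernels above.
Status BuildOutputSequence(OpKernelContext* context, const AllocatorPtr& alloc,
                           MLDataType elem_type,
                           const std::vector<const Tensor*>& sources,
                           cudaStream_t stream) {
  TensorSeq* Y = context->Output<TensorSeq>(0);
  Y->SetType(elem_type);        // one element type per sequence
  Y->Reserve(sources.size());   // new: single up-front reservation

  for (const Tensor* src : sources) {
    auto dst = Tensor::Create(src->DataType(), src->Shape(), alloc);
    CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst->MutableDataRaw(), src->DataRaw(),
                                         src->SizeInBytes(),
                                         cudaMemcpyDeviceToDevice, stream));
    Y->Add(std::move(*dst));    // Add re-checks the element type
  }
  return Status::OK();
}
```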
@@ -750,6 +750,7 @@ struct ProviderHost {
virtual size_t TensorSeq__Size(const TensorSeq* p) noexcept = 0;
virtual const Tensor& TensorSeq__Get(const TensorSeq* p, size_t i) = 0;
virtual void TensorSeq__Add(TensorSeq* p, Tensor&& tensor) = 0;
+ virtual void TensorSeq__Reserve(TensorSeq* p, size_t capacity) = 0;

// AllocatorManager
virtual void AllocatorManager__InsertAllocator(AllocatorManager* p, AllocatorPtr allocator) = 0;
@@ -951,9 +951,10 @@ struct TensorSeq final {
size_t Size() const noexcept { return g_host->TensorSeq__Size(this); }
const Tensor& Get(size_t i) const { return g_host->TensorSeq__Get(this, i); }
void Add(Tensor&& tensor) { g_host->TensorSeq__Add(this, std::move(tensor)); }
+ void Reserve(size_t capacity) { g_host->TensorSeq__Reserve(this, capacity); }
};

template <>
inline gsl::span<const int64_t> Tensor::DataAsSpan() const { return g_host->Tensor__DataAsSpan_int64(this); }

- } // namespace onnxruntime
+ } // namespace onnxruntime
1 change: 1 addition & 0 deletions onnxruntime/core/session/provider_bridge_ort.cc
@@ -839,6 +839,7 @@ struct ProviderHostImpl : ProviderHost {
size_t TensorSeq__Size(const TensorSeq* p) noexcept override { return p->Size(); }
const Tensor& TensorSeq__Get(const TensorSeq* p, size_t i) override { return p->Get(i); }
void TensorSeq__Add(TensorSeq* p, Tensor&& tensor) override { p->Add(std::move(tensor)); }
+ void TensorSeq__Reserve(TensorSeq* p, size_t capacity) override { p->Reserve(capacity); }

// AllocatorManager (direct)
void AllocatorManager__InsertAllocator(AllocatorManager* p, AllocatorPtr allocator) override { p->AllocatorManager::InsertAllocator(allocator); }
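Note: the last three hunks plumb the new method through the provider bridge so execution providers built as shared libraries can call it: an abstract TensorSeq__Reserve on ProviderHost, a Reserve wrapper on the bridged TensorSeq that forwards through g_host, and the concrete implementation in ProviderHostImpl. A self-contained miniature of that three-part pattern (types heavily simplified; only the names match the real code, and in the real bridge the wrapper method runs on the same object rather than holding a separate pointer):

```cpp
#include <cstddef>
#include <vector>

struct Tensor {};                        // toy stand-in

class CoreTensorSeq {                    // plays the role of onnxruntime::TensorSeq
 public:
  void Reserve(size_t capacity) { tensors_.reserve(capacity); }
 private:
  std::vector<Tensor> tensors_;
};

// 1) Interface the core runtime exposes to shared-library providers.
struct ProviderHost {
  virtual ~ProviderHost() = default;
  virtual void TensorSeq__Reserve(CoreTensorSeq* p, size_t capacity) = 0;
};

// 2) Implementation inside the core runtime (provider_bridge_ort.cc above).
struct ProviderHostImpl : ProviderHost {
  void TensorSeq__Reserve(CoreTensorSeq* p, size_t capacity) override { p->Reserve(capacity); }
};

// 3) Thin wrapper compiled into the provider shared library; every call is
//    forwarded through the g_host pointer handed over at load time.
inline ProviderHost* g_host = nullptr;
struct TensorSeqWrapper {
  CoreTensorSeq* impl;
  void Reserve(size_t capacity) { g_host->TensorSeq__Reserve(impl, capacity); }
};
```

Roughly speaking, a single ProviderHostImpl instance lives inside onnxruntime and its address is handed to each provider library, which stores it in g_host; the wrapper methods then dispatch through it exactly as the one-line Reserve above does.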
