Optimize sequence type usage on CUDA [3/n] (#9002)
hariharans29 authored Sep 8, 2021
1 parent 2e8792c commit c20cb76
Showing 6 changed files with 48 additions and 47 deletions.
4 changes: 4 additions & 0 deletions onnxruntime/core/framework/TensorSeq.h
@@ -68,6 +68,10 @@ class TensorSeq {
tensors_.push_back(std::move(tensor));
}

+ void Reserve(size_t capacity) {
+ tensors_.reserve(capacity);
+ }

private:
// A sequence must be associated with only one data type and all tensors in the seq must be of that type
// One other alternative of storing the data type of a seq is to templatize the TensorSeq class.
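Note: the new TensorSeq::Reserve simply forwards to std::vector::reserve on the backing tensor storage, so a kernel that knows its output size can pre-size the sequence once instead of growing it tensor by tensor. A minimal usage sketch (not part of this commit; FillSequence and its parameters are hypothetical, only SetType/Reserve/Add come from the class above):

```cpp
#include <utility>
#include <vector>
// Assumes the usual onnxruntime headers for TensorSeq, Tensor and MLDataType.

// Hypothetical helper showing the Reserve-before-Add pattern the CUDA kernels
// below adopt: one up-front allocation, then move each tensor into the sequence.
void FillSequence(onnxruntime::TensorSeq* seq,        // non-null output sequence
                  onnxruntime::MLDataType elem_type,  // single element type of the sequence
                  std::vector<onnxruntime::Tensor>& tensors) {
  seq->SetType(elem_type);
  seq->Reserve(tensors.size());   // new API: reserve capacity once
  for (auto& t : tensors) {
    seq->Add(std::move(t));       // Add checks type consistency internally
  }
}
```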
20 changes: 6 additions & 14 deletions onnxruntime/core/providers/cpu/sequence/sequence_ops.cc
@@ -29,10 +29,8 @@ ONNX_CPU_OPERATOR_KERNEL(

Status SequenceLength::Compute(OpKernelContext* context) const {
const auto* X = context->Input<TensorSeq>(0);
- ORT_ENFORCE(X != nullptr, "Got nullptr for sequence input.");

auto* Y = context->Output(0, {});
- ORT_ENFORCE(Y != nullptr, "SequenceLength: Got nullptr for output tensor");
auto* Y_data = Y->template MutableData<int64_t>();
*Y_data = static_cast<int64_t>(X->Size());

@@ -83,10 +81,9 @@ bool ValidateSeqIdx(int64_t input_seq_idx, int64_t seq_size) {

Status SequenceAt::Compute(OpKernelContext* context) const {
const auto* X = context->Input<TensorSeq>(0);
- ORT_ENFORCE(X != nullptr, "Got nullptr for sequence input.");

const auto* I = context->Input<Tensor>(1);
- ORT_ENFORCE(I != nullptr, "Got nullptr input for index tensor");

int64_t input_seq_idx = GetSeqIdx(*I);
if (!ValidateSeqIdx(input_seq_idx, static_cast<int64_t>(X->Size()))) {
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
@@ -98,7 +95,7 @@ Status SequenceAt::Compute(OpKernelContext* context) const {
}
const Tensor& indexed_tensor = X->Get(input_seq_idx);
auto* Y = context->Output(0, indexed_tensor.Shape().GetDims());
- ORT_ENFORCE(Y != nullptr, "SequenceAt: Got nullptr for output tensor");

CopyCpuTensor(&indexed_tensor, Y);

return Status::OK();
@@ -120,7 +117,7 @@ SequenceEmpty::SequenceEmpty(const OpKernelInfo& info) : OpKernel(info) {

Status SequenceEmpty::Compute(OpKernelContext* context) const {
auto* Y = context->Output<TensorSeq>(0);
- ORT_ENFORCE(Y != nullptr, "SequenceEmpty: Got nullptr for output sequence");

MLDataType seq_dtype{};
switch (dtype_) {
case ONNX_NAMESPACE::TensorProto_DataType_FLOAT:
@@ -195,10 +192,7 @@ Status CreateCopyAndAppendCpuTensor(const Tensor& in_tensor, OpKernelContext* co

Status SequenceInsert::Compute(OpKernelContext* context) const {
const auto* S = context->Input<TensorSeq>(0);
- ORT_ENFORCE(S != nullptr, "Got nullptr for sequence input.");

const auto* X = context->Input<Tensor>(1);
- ORT_ENFORCE(X != nullptr, "Got nullptr for input tensor.");

// Data type of the input tensor MUST be same as that of the input sequence
if (!S->IsSameDataType(*X)) {
@@ -223,7 +217,7 @@ Status SequenceInsert::Compute(OpKernelContext* context) const {
}

auto* Y = context->Output<TensorSeq>(0);
- ORT_ENFORCE(Y != nullptr, "SequenceInsert: Got nullptr for output sequence");

std::vector<Tensor> tensors;
tensors.reserve(num_tensors_input_seq + 1);
for (int i = 0; i < num_tensors_input_seq; ++i) {
@@ -257,9 +251,8 @@ ONNX_CPU_OPERATOR_KERNEL(

Status SequenceErase::Compute(OpKernelContext* context) const {
const auto* S = context->Input<TensorSeq>(0);
- ORT_ENFORCE(S != nullptr, "Got nullptr for sequence input.");

const auto* I = context->Input<Tensor>(1);

int64_t num_tensors_input_seq = static_cast<int64_t>(S->Size());
int64_t input_seq_idx = num_tensors_input_seq - 1; // default is erase last one
if (I) { // position is optional
@@ -275,8 +268,8 @@ Status SequenceErase::Compute(OpKernelContext* context) const {
}

auto* Y = context->Output<TensorSeq>(0);
- ORT_ENFORCE(Y != nullptr, "SequenceErase: Got nullptr for output sequence");
Y->SetType(S->DataType());

std::vector<Tensor> tensors;
tensors.reserve(num_tensors_input_seq - 1);
for (int i = 0; i < num_tensors_input_seq; ++i) {
@@ -303,7 +296,6 @@ Status SequenceConstruct::Compute(OpKernelContext* context) const {
ORT_ENFORCE(num_inputs >= 1, "Must have 1 or more inputs");

auto* Y = context->Output<TensorSeq>(0);
- ORT_ENFORCE(Y != nullptr, "SequenceConstruct: Got nullptr for output sequence");

MLDataType first_dtype = context->Input<Tensor>(0)->DataType();
// Before copying check if all tensors are of the same type.
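Note: GetSeqIdx and ValidateSeqIdx (visible in the hunk headers above) implement the ONNX convention that a sequence position may be negative, in which case it counts back from the end. A self-contained restatement of that rule (my sketch, not the file's exact code; SequenceInsert additionally accepts position == n, i.e. append, as the CUDA hunk further below shows):

```cpp
#include <cstdint>

// For a sequence of n tensors, SequenceAt/SequenceErase accept positions in
// [-n, n-1]; negative values count from the end (-1 is the last element).
bool IsValidSeqIdx(int64_t idx, int64_t n) {
  return idx >= -n && idx < n;
}

// Map a possibly negative position to a plain index, e.g. n == 4, idx == -1 -> 3.
int64_t NormalizeSeqIdx(int64_t idx, int64_t n) {
  return idx < 0 ? idx + n : idx;
}
```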
66 changes: 34 additions & 32 deletions onnxruntime/core/providers/cuda/tensor/sequence_op.h
@@ -35,7 +35,6 @@ class SequenceAt final : public CudaKernel {
const void* source_addr = source_tensor.DataRaw(source_type);

Tensor* target_tensor = context->Output(0, source_tensor.Shape());
- ORT_ENFORCE(target_tensor != nullptr, "SequenceAt GPU: Got nullptr for output tensor.");
void* target_addr = target_tensor->MutableDataRaw(source_type);

if (source_addr != target_addr) {
@@ -52,27 +51,33 @@ class SequenceConstruct final : public CudaKernel {
public:
SequenceConstruct(const OpKernelInfo& info) : CudaKernel(info) {}
Status ComputeInternal(OpKernelContext* context) const override {
- TensorSeq* Y = context->Output<TensorSeq>(0);
+ auto num_inputs = Node().InputArgCount().front();
+ ORT_ENFORCE(num_inputs >= 1, "Must have 1 or more inputs");

+ MLDataType first_dtype = context->Input<Tensor>(0)->DataType();

AllocatorPtr alloc;
ORT_ENFORCE(context->GetTempSpaceAllocator(&alloc).IsOK(),
"SequenceConstruct GPU: Unable to get an allocator.");

- int32_t at = 0;
- const Tensor* source_tensor = nullptr;
- while (nullptr != (source_tensor = context->Input<Tensor>(at++))) {
- if (1 == at) {
- Y->SetType(source_tensor->DataType());
- }
+ TensorSeq* Y = context->Output<TensorSeq>(0);
+ Y->SetType(first_dtype);
+ Y->Reserve(num_inputs);

+ for (int input_idx = 0; input_idx < num_inputs; ++input_idx) {
+ const auto* source_tensor = context->Input<Tensor>(input_idx);

std::unique_ptr<Tensor> target_tensor = Tensor::Create(source_tensor->DataType(),
source_tensor->Shape(), alloc);
- ORT_ENFORCE(target_tensor, "SequenceConstruct GPU: Failed to allocate new tensor.");

CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(target_tensor->MutableDataRaw(),
source_tensor->DataRaw(),
source_tensor->SizeInBytes(),
cudaMemcpyDeviceToDevice, Stream()));
- Y->Add(std::move(*target_tensor)); // Add will check type consistency inside

+ Y->Add(std::move(*target_tensor)); // Add will check for type consistency
}

return Status::OK();
}
}; // SequenceConstruct
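Note: every CUDA kernel in this file moves tensor payloads with the same asynchronous device-to-device copy on the kernel's stream. A stripped-down, self-contained version of that copy (plain CUDA runtime API, no ORT types; in the kernels the stream is Stream() and the returned status is wrapped by CUDA_RETURN_IF_ERROR):

```cpp
#include <cuda_runtime.h>

// Asynchronous device-to-device copy ordered with other work on `stream`;
// no explicit synchronization is needed before returning as long as
// downstream consumers of the destination buffer use the same stream.
cudaError_t CopyDeviceBuffer(void* dst, const void* src, size_t bytes,
                             cudaStream_t stream) {
  return cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToDevice, stream);
}
```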
@@ -179,9 +184,11 @@ class SequenceErase final : public CudaKernel {
AllocatorPtr alloc;
ORT_ENFORCE(context->GetTempSpaceAllocator(&alloc).IsOK(),
"SequenceErase GPU: Unable to get an allocator.");

TensorSeq* Y = context->Output<TensorSeq>(0);
- ORT_ENFORCE(Y != nullptr, "SequenceErase GPU: Failed to allocate output tensor sequence.");
Y->SetType(X->DataType());
+ Y->Reserve(X_size - 1);

for (int64_t i = 0; i < X_size; ++i) {
if (i == idx) {
continue;
@@ -190,13 +197,13 @@
std::unique_ptr<Tensor> target_tensor = Tensor::Create(source_tensor.DataType(),
source_tensor.Shape(), alloc);

- ORT_ENFORCE(target_tensor, "SequenceErase GPU: Failed to allocate new tensor.");
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(target_tensor->MutableDataRaw(),
source_tensor.DataRaw(),
source_tensor.SizeInBytes(),
cudaMemcpyDeviceToDevice, Stream()));
- Y->Add(std::move(*target_tensor));
+ Y->Add(std::move(*target_tensor)); // Add will check for type consistency
}

return Status::OK();
}
}; // SequenceErase
@@ -223,44 +230,39 @@ class SequenceInsert final : public CudaKernel {
ORT_ENFORCE(idx >= 0 && idx <= S_size, "SequenceInsert GPU: Invalid sequence index.");
}
const Tensor* X = context->Input<Tensor>(1);
- ORT_ENFORCE(X != nullptr, "SequenceInsert GPU: Got nullptr for tensor input.");

AllocatorPtr alloc;
ORT_ENFORCE(context->GetTempSpaceAllocator(&alloc).IsOK(),
"SequenceInsert GPU: Unable to get an allocator.");

+ std::unique_ptr<Tensor> tensor_to_be_inserted = Tensor::Create(X->DataType(),
+ X->Shape(), alloc);
+ CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(tensor_to_be_inserted->MutableDataRaw(),
+ X->DataRaw(), X->SizeInBytes(),
+ cudaMemcpyDeviceToDevice, Stream()));

TensorSeq* Y = context->Output<TensorSeq>(0);
- ORT_ENFORCE(Y != nullptr, "SequenceInsert GPU: Failed to allocate output tensor sequence.");
Y->SetType(S->DataType());
+ Y->Reserve(S_size + 1);

for (int64_t i = 0; i < S_size; ++i) {
if (i == idx) {
- std::unique_ptr<Tensor> target_tensor = Tensor::Create(X->DataType(),
- X->Shape(), alloc);
- ORT_ENFORCE(target_tensor, "SequenceInsert GPU: Failed to allocate new tensor.");
- CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(target_tensor->MutableDataRaw(),
- X->DataRaw(), X->SizeInBytes(),
- cudaMemcpyDeviceToDevice, Stream()));
- Y->Add(std::move(*target_tensor));
+ Y->Add(std::move(*tensor_to_be_inserted)); // Add will check for type consistency
}
const Tensor& source_tensor = S->Get(i);
std::unique_ptr<Tensor> target_tensor = Tensor::Create(source_tensor.DataType(),
source_tensor.Shape(), alloc);
- ORT_ENFORCE(target_tensor, "SequenceInsert GPU: Failed to allocate new tensor.");
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(target_tensor->MutableDataRaw(),
source_tensor.DataRaw(),
source_tensor.SizeInBytes(),
cudaMemcpyDeviceToDevice, Stream()));
- Y->Add(std::move(*target_tensor)); // Add will check type consistency inside
- } // for
+ Y->Add(std::move(*target_tensor)); // Add will check for type consistency
+ }

if (idx == S_size) {
- std::unique_ptr<Tensor> target_tensor = Tensor::Create(X->DataType(),
- X->Shape(), alloc);
- ORT_ENFORCE(target_tensor, "SequenceInsert GPU: Failed to allocate new tensor.");
- CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(target_tensor->MutableDataRaw(),
- X->DataRaw(), X->SizeInBytes(),
- cudaMemcpyDeviceToDevice, Stream()));
- Y->Add(std::move(*target_tensor));
+ Y->Add(std::move(*tensor_to_be_inserted)); // Add will check for type consistency
}

return Status::OK();
}
}; // SequenceInsert
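Note: after this change SequenceConstruct, SequenceErase and SequenceInsert all share the same shape on CUDA: set the element type, Reserve the exact output size, then Create/copy/Add each tensor. A condensed restatement of that flow (not compilable on its own: OpKernelContext, AllocatorPtr, Tensor::Create, CUDA_RETURN_IF_ERROR and the stream all come from the surrounding CudaKernel code above; BuildOutputSequence itself is hypothetical):

```cpp
// Condensed sketch of the common pattern used by the three kernels above.
Status BuildOutputSequence(OpKernelContext* context, const AllocatorPtr& alloc,
                           MLDataType elem_type,
                           const std::vector<const Tensor*>& sources,
                           cudaStream_t stream) {
  TensorSeq* Y = context->Output<TensorSeq>(0);
  Y->SetType(elem_type);        // one element type per sequence
  Y->Reserve(sources.size());   // new: single up-front reservation

  for (const Tensor* src : sources) {
    auto dst = Tensor::Create(src->DataType(), src->Shape(), alloc);
    CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(dst->MutableDataRaw(), src->DataRaw(),
                                         src->SizeInBytes(),
                                         cudaMemcpyDeviceToDevice, stream));
    Y->Add(std::move(*dst));    // Add re-checks the element type
  }
  return Status::OK();
}
```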
@@ -750,6 +750,7 @@ struct ProviderHost {
virtual size_t TensorSeq__Size(const TensorSeq* p) noexcept = 0;
virtual const Tensor& TensorSeq__Get(const TensorSeq* p, size_t i) = 0;
virtual void TensorSeq__Add(TensorSeq* p, Tensor&& tensor) = 0;
+ virtual void TensorSeq__Reserve(TensorSeq* p, size_t capacity) = 0;

// AllocatorManager
virtual void AllocatorManager__InsertAllocator(AllocatorManager* p, AllocatorPtr allocator) = 0;
@@ -951,9 +951,10 @@ struct TensorSeq final {
size_t Size() const noexcept { return g_host->TensorSeq__Size(this); }
const Tensor& Get(size_t i) const { return g_host->TensorSeq__Get(this, i); }
void Add(Tensor&& tensor) { g_host->TensorSeq__Add(this, std::move(tensor)); }
+ void Reserve(size_t capacity) { g_host->TensorSeq__Reserve(this, capacity); }
};

template <>
inline gsl::span<const int64_t> Tensor::DataAsSpan() const { return g_host->Tensor__DataAsSpan_int64(this); }

- } // namespace onnxruntime
+ } // namespace onnxruntime
1 change: 1 addition & 0 deletions onnxruntime/core/session/provider_bridge_ort.cc
@@ -839,6 +839,7 @@ struct ProviderHostImpl : ProviderHost {
size_t TensorSeq__Size(const TensorSeq* p) noexcept override { return p->Size(); }
const Tensor& TensorSeq__Get(const TensorSeq* p, size_t i) override { return p->Get(i); }
void TensorSeq__Add(TensorSeq* p, Tensor&& tensor) override { p->Add(std::move(tensor)); }
+ void TensorSeq__Reserve(TensorSeq* p, size_t capacity) override { p->Reserve(capacity); }

// AllocatorManager (direct)
void AllocatorManager__InsertAllocator(AllocatorManager* p, AllocatorPtr allocator) override { p->AllocatorManager::InsertAllocator(allocator); }
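Note: the last three hunks plumb the new method through the provider bridge so execution providers built as shared libraries can call it: an abstract TensorSeq__Reserve on ProviderHost, a Reserve wrapper on the bridged TensorSeq that forwards through g_host, and the concrete implementation in ProviderHostImpl. A self-contained miniature of that three-part pattern (types heavily simplified; only the names match the real code, and in the real bridge the wrapper method runs on the same object rather than holding a separate pointer):

```cpp
#include <cstddef>
#include <vector>

struct Tensor {};                        // toy stand-in

class CoreTensorSeq {                    // plays the role of onnxruntime::TensorSeq
 public:
  void Reserve(size_t capacity) { tensors_.reserve(capacity); }
 private:
  std::vector<Tensor> tensors_;
};

// 1) Interface the core runtime exposes to shared-library providers.
struct ProviderHost {
  virtual ~ProviderHost() = default;
  virtual void TensorSeq__Reserve(CoreTensorSeq* p, size_t capacity) = 0;
};

// 2) Implementation inside the core runtime (provider_bridge_ort.cc above).
struct ProviderHostImpl : ProviderHost {
  void TensorSeq__Reserve(CoreTensorSeq* p, size_t capacity) override { p->Reserve(capacity); }
};

// 3) Thin wrapper compiled into the provider shared library; every call is
//    forwarded through the g_host pointer handed over at load time.
inline ProviderHost* g_host = nullptr;
struct TensorSeqWrapper {
  CoreTensorSeq* impl;
  void Reserve(size_t capacity) { g_host->TensorSeq__Reserve(impl, capacity); }
};
```

Roughly speaking, a single ProviderHostImpl instance lives inside onnxruntime and its address is handed to each provider library, which stores it in g_host; the wrapper methods then dispatch through it exactly as the one-line Reserve above does.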
