diff --git a/include/tvm/runtime/ndarray.h b/include/tvm/runtime/ndarray.h
index a4c285e3dd08..e80ed5fb1f8f 100644
--- a/include/tvm/runtime/ndarray.h
+++ b/include/tvm/runtime/ndarray.h
@@ -155,6 +155,24 @@ class NDArray : public ObjectRef {
    */
   TVM_DLL static NDArray Empty(ShapeTuple shape, DLDataType dtype, Device dev,
                                Optional<String> mem_scope = NullOpt);
+  /*!
+   * \brief Create a NDArray backed by an external DLTensor.
+   *
+   * This allows us to create a NDArray using the memory
+   * allocated by an external source. Responsibility for retaining
+   * the memory lies with the external source.
+   * \param dl_tensor The DLTensor to create the view from.
+   * \return The created NDArray view.
+   */
+  TVM_DLL static NDArray FromExternalDLTensor(const DLTensor& dl_tensor);
+  /*!
+   * \brief Create new NDArray, data is copied from DLTensor.
+   *
+   * \param dl_tensor The DLTensor to copy from.
+   * \param dev device location of the created NDArray.
+   * \return The created NDArray with copied data.
+   */
+  TVM_DLL static NDArray NewFromDLTensor(DLTensor* dl_tensor, Device dev);
   /*!
    * \brief Create a NDArray backed by a dlpack tensor.
    *
diff --git a/python/tvm/runtime/vm.py b/python/tvm/runtime/vm.py
index 0592368f6b0a..6e59c3455a91 100644
--- a/python/tvm/runtime/vm.py
+++ b/python/tvm/runtime/vm.py
@@ -426,6 +426,10 @@ def _setup_device(self, dev, memory_cfg):
 
     def set_input(self, func_name, *args, **kwargs):
         """Set the input to a function.
+        If the device type and device id of an input tensor match
+        those of the target device, zero copy is used: the internal
+        tensor is a reference to the memory allocated by the input one.
+        Otherwise a new internal NDArray is created and the data is copied.
 
         Parameters
         ----------
diff --git a/src/runtime/ndarray.cc b/src/runtime/ndarray.cc
index 3b75540f8763..f44dc86f902a 100644
--- a/src/runtime/ndarray.cc
+++ b/src/runtime/ndarray.cc
@@ -121,6 +121,13 @@ struct NDArray::Internal {
     }
     delete ptr;
   }
+  // Deleter for NDArray based on external DLTensor
+  // The memory is allocated from outside and it is assumed that
+  // responsibility for its freeing is also outside
+  static void SelfDeleter(Object* ptr_obj) {
+    auto* ptr = static_cast<NDArray::Container*>(ptr_obj);
+    delete ptr;
+  }
   // Local create function which allocates tensor metadata
   // but does not allocate space for the data.
   static NDArray Create(ShapeTuple shape, DLDataType dtype, Device dev) {
@@ -198,6 +205,30 @@ NDArray NDArray::Empty(ShapeTuple shape, DLDataType dtype, Device dev, Optional<
   return ret;
 }
 
+NDArray NDArray::FromExternalDLTensor(const DLTensor& dl_tensor) {
+  NDArray::Container* data = new NDArray::Container();
+
+  data->SetDeleter(Internal::SelfDeleter);
+  data->dl_tensor = dl_tensor;
+  std::vector<ShapeTuple::index_type> shape;
+  shape.resize(data->dl_tensor.ndim);
+  shape.assign(data->dl_tensor.shape, data->dl_tensor.shape + data->dl_tensor.ndim);
+  data->shape_ = ShapeTuple(shape);
+  data->dl_tensor.shape = const_cast<ShapeTuple::index_type*>(data->shape_.data());
+
+  return NDArray(GetObjectPtr<Object>(data));
+}
+
+NDArray NDArray::NewFromDLTensor(DLTensor* tensor, Device dev) {
+  std::vector<int64_t> shape;
+  for (int64_t i = 0; i < tensor->ndim; i++) {
+    shape.push_back(tensor->shape[i]);
+  }
+  NDArray ary = NDArray::Empty(shape, tensor->dtype, dev);
+  ary.CopyFrom(tensor);
+  return ary;
+}
+
 NDArray NDArray::FromDLPack(DLManagedTensor* tensor) {
   NDArray::Container* data = new NDArray::Container();
   // construct header
diff --git a/src/runtime/vm/vm.cc b/src/runtime/vm/vm.cc
index 38d793606dc4..41b9395237ee 100644
--- a/src/runtime/vm/vm.cc
+++ b/src/runtime/vm/vm.cc
@@ -70,8 +70,15 @@ inline ObjectRef CopyTo(ObjectRef src, const DLDevice& dev) {
   if (src->IsInstance<NDArray::ContainerType>()) {
     auto nd_array = Downcast<NDArray>(src);
     // TODO(mbs): Should respect device id also.
-    if (nd_array->device.device_type != dev.device_type) {
-      VLOG(2) << "copying from " << nd_array->device.device_type << " to " << dev.device_type;
+    // TODO(vvchernov): it still does not work for different device id
+    // due to simple implementation of Get() and AllocDataSpace() methods
+    // see tvm/src/runtime/c_runtime_api.cc: L139
+    // tvm/src/runtime/cpu_device_api.cc: L47
+    if (nd_array->device.device_type != dev.device_type ||
+        nd_array->device.device_id != dev.device_id) {
+      VLOG(2) << "copying from " << nd_array->device.device_type << "["
+              << nd_array->device.device_id << "] to " << dev.device_type << "[" << dev.device_id
+              << "]";
       return nd_array.CopyTo(dev);
     }
     return src;
@@ -303,13 +310,12 @@ void VirtualMachine::SetInputTensorWithIndex(std::vector<ObjectRef>& tensors,
   if (inp_tensor.type_code() == kTVMDLTensorHandle) {
     // Automatically convert input DLTensors to NDArray
     DLTensor* tensor = inp_tensor;
-    std::vector<int64_t> shape;
-    for (int64_t i = 0; i < tensor->ndim; i++) {
-      shape.push_back(tensor->shape[i]);
+    if (dev.device_type == tensor->device.device_type &&
+        dev.device_id == tensor->device.device_id) {
+      tensors[index] = NDArray::FromExternalDLTensor(*tensor);
+    } else {
+      tensors[index] = NDArray::NewFromDLTensor(tensor, dev);
     }
-    NDArray ary = NDArray::Empty(shape, tensor->dtype, dev);
-    ary.CopyFrom(tensor);
-    tensors[index] = ary;
   } else {
     tensors[index] = CopyTo(inp_tensor, dev);
   }