From f86d35a269a94e8af7bec5945be01ab0acd76730 Mon Sep 17 00:00:00 2001
From: typhoonzero
Date: Mon, 16 Apr 2018 17:11:11 +0800
Subject: [PATCH 1/9] add sharable tensor

---
 paddle/fluid/framework/tensor.h      | 29 ++++++++++++++++++++++++++
 paddle/fluid/framework/tensor_impl.h | 31 ++++++++++++++++++++++++++++
 2 files changed, 60 insertions(+)

diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h
index 6f878541e6de1d..1e5c68a1b9d4bb 100644
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -98,6 +98,9 @@ class Tensor {
   /*! The internal of two tensors share the same memory block. */
   inline Tensor& ShareDataWith(const Tensor& src);
 
+  /*! Share part of the memory of the two tensors */
+  inline Tensor& ShareDataWith(Tensor* src, size_t offset);
+
   /**
    * @brief   Return a sub-tensor of the given tensor.
    *
@@ -176,6 +179,32 @@ class Tensor {
     std::type_index type_;
   };
 
+  template <typename Place>
+  struct SharedPlaceholderImpl : public Placeholder {
+    SharedPlaceholderImpl(Place place, uint8_t* data, size_t size,
+                          std::type_index type)
+        : ptr_(data), place_(place), size_(size), type_(type) {}
+
+    virtual size_t size() const { return size_; }
+    virtual platform::Place place() const { return place_; }
+    virtual void* ptr() const { return static_cast<void*>(ptr_); }
+    virtual std::type_index type() const { return type_; }
+    virtual void set_type(std::type_index type) { type_ = type; }
+    virtual void set_place(platform::Place place) { place_ = place; }
+
+    /*! the pointer of memory block. */
+    uint8_t* ptr_;
+
+    /*! the place of memory block. */
+    platform::Place place_;
+
+    /*! the size of memory block. */
+    size_t size_;
+
+    /* the current type of memory */
+    std::type_index type_;
+  };
+
   /*! holds the memory block if allocated. */
   std::shared_ptr<Placeholder> holder_;
 
diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h
index f49d1a47a325b2..98d53fd1e7db95 100644
--- a/paddle/fluid/framework/tensor_impl.h
+++ b/paddle/fluid/framework/tensor_impl.h
@@ -162,6 +162,37 @@ inline Tensor& Tensor::ShareDataWith(const Tensor& src) {
   return *this;
 }
 
+inline Tensor& Tensor::ShareDataWith(Tensor* src, size_t offset) {
+  // NOTE: data size is determined by current tensor shape and data type
+  src->check_memory_size();
+  PADDLE_ENFORCE_EQ(src->type(), this->type(),
+                    "tensor data type must be the same when sharing data");
+  auto place = src->place();
+  auto type = src->type();
+  size_t size = src->numel() * SizeOfType(src->type());
+  auto* ref = static_cast<uint8_t*>(src->mutable_data(place)) + offset;
+  if (platform::is_cpu_place(place)) {
+    holder_.reset(new SharedPlaceholderImpl<platform::CPUPlace>(
+        boost::get<platform::CPUPlace>(place), ref, size, type));
+  } else if (platform::is_gpu_place(place) ||
+             platform::is_cuda_pinned_place(place)) {
+#ifndef PADDLE_WITH_CUDA
+    PADDLE_THROW(
+        "CUDAPlace or CUDAPinnedPlace is not supported in CPU-only mode.");
+  }
+#else
+    if (platform::is_gpu_place(place)) {
+      holder_.reset(new SharedPlaceholderImpl<platform::CUDAPlace>(
+          boost::get<platform::CUDAPlace>(place), ref, size, type));
+    } else if (platform::is_cuda_pinned_place(place)) {
+      holder_.reset(new SharedPlaceholderImpl<platform::CUDAPinnedPlace>(
+          boost::get<platform::CUDAPinnedPlace>(place), ref, size, type));
+    }
+  }
+#endif
+  return *this;
+}
+
 inline Tensor Tensor::Slice(int begin_idx, int end_idx) const {
   check_memory_size();
   PADDLE_ENFORCE_GE(begin_idx, 0,

From 04c559e3aad8510fb6abfb9e469449913971266c Mon Sep 17 00:00:00 2001
From: typhoonzero
Date: Mon, 16 Apr 2018 20:32:18 +0800
Subject: [PATCH 2/9] wip split byref op

---
 paddle/fluid/framework/tensor.h             |  10 +-
 paddle/fluid/framework/tensor_impl.h        |   4 +-
 paddle/fluid/operators/split_byref_op.cc    | 101 ++++++++++++++++++++
 paddle/fluid/operators/split_byref_op.cu.cc |  18 ++++
 paddle/fluid/operators/split_byref_op.h     |  43 +++++++++
 paddle/fluid/operators/split_op.cc          |  15 ---
 paddle/fluid/operators/split_op.h           |  15 +++
 7 files changed, 185 insertions(+), 21 deletions(-)
 create mode 100644 paddle/fluid/operators/split_byref_op.cc
 create mode 100644 paddle/fluid/operators/split_byref_op.cu.cc
 create mode 100644 paddle/fluid/operators/split_byref_op.h

diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h
index 1e5c68a1b9d4bb..f30dcc000b7142 100644
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -99,7 +99,7 @@ class Tensor {
   inline Tensor& ShareDataWith(const Tensor& src);
 
   /*! Share part of the memory of the two tensors */
-  inline Tensor& ShareDataWith(Tensor* src, size_t offset);
+  inline Tensor& ShareDataWith(const Tensor* src, size_t offset);
 
   /**
    * @brief   Return a sub-tensor of the given tensor.
@@ -181,19 +181,21 @@ class Tensor {
   template <typename Place>
   struct SharedPlaceholderImpl : public Placeholder {
-    SharedPlaceholderImpl(Place place, uint8_t* data, size_t size,
+    SharedPlaceholderImpl(Place place, const uint8_t* data, size_t size,
                           std::type_index type)
         : ptr_(data), place_(place), size_(size), type_(type) {}
 
     virtual size_t size() const { return size_; }
     virtual platform::Place place() const { return place_; }
-    virtual void* ptr() const { return static_cast<void*>(ptr_); }
+    virtual void* ptr() const {
+      return const_cast<void*>(static_cast<const void*>(ptr_));
+    }
     virtual std::type_index type() const { return type_; }
     virtual void set_type(std::type_index type) { type_ = type; }
     virtual void set_place(platform::Place place) { place_ = place; }
 
     /*! the pointer of memory block. */
-    uint8_t* ptr_;
+    const uint8_t* ptr_;
 
     /*! the place of memory block. */
     platform::Place place_;
 
diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h
index 98d53fd1e7db95..a177ef74166f20 100644
--- a/paddle/fluid/framework/tensor_impl.h
+++ b/paddle/fluid/framework/tensor_impl.h
@@ -162,7 +162,7 @@ inline Tensor& Tensor::ShareDataWith(const Tensor& src) {
   return *this;
 }
 
-inline Tensor& Tensor::ShareDataWith(Tensor* src, size_t offset) {
+inline Tensor& Tensor::ShareDataWith(const Tensor* src, size_t offset) {
   // NOTE: data size is determined by current tensor shape and data type
   src->check_memory_size();
   PADDLE_ENFORCE_EQ(src->type(), this->type(),
@@ -170,7 +170,7 @@ inline Tensor& Tensor::ShareDataWith(const Tensor* src, size_t offset) {
   auto place = src->place();
   auto type = src->type();
   size_t size = src->numel() * SizeOfType(src->type());
-  auto* ref = static_cast<uint8_t*>(src->mutable_data(place)) + offset;
+  auto* ref = src->data<uint8_t>() + offset;
   if (platform::is_cpu_place(place)) {
     holder_.reset(new SharedPlaceholderImpl<platform::CPUPlace>(
         boost::get<platform::CPUPlace>(place), ref, size, type));
diff --git a/paddle/fluid/operators/split_byref_op.cc b/paddle/fluid/operators/split_byref_op.cc
new file mode 100644
index 00000000000000..7413ce3e9ce60e
--- /dev/null
+++ b/paddle/fluid/operators/split_byref_op.cc
@@ -0,0 +1,101 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/split_byref_op.h"
+#include "paddle/fluid/operators/split_op.h"
+
+namespace paddle {
+namespace operators {
+using framework::Tensor;
+
+class SplitByrefOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SplitOp should not be null.");
+    PADDLE_ENFORCE_GE(ctx->Outputs("Out").size(), 1UL,
+                      "Outputs(Out) of SplitOp should not be empty.");
+    auto in_dims = ctx->GetInputDim("X");
+    auto outs_names = ctx->Outputs("Out");
+    size_t num = static_cast<size_t>(ctx->Attrs().Get<int>("num"));
+    std::vector<int> sections = static_cast<std::vector<int>>(
+        ctx->Attrs().Get<std::vector<int>>("sections"));
+    const size_t outs_number = outs_names.size();
+    std::vector<framework::DDim> outs_dims;
+    outs_dims.reserve(outs_number);
+
+    if (num > 0) {
+      int64_t in_axis_dim = in_dims[0];
+      PADDLE_ENFORCE_EQ(in_axis_dim % num, 0,
+                        "tensor split does not result"
+                        " in an equal division");
+      size_t out_axis_dim = in_axis_dim / num;
+      for (size_t i = 0; i < outs_number; ++i) {
+        auto dim = in_dims;
+        dim[0] = out_axis_dim;
+        outs_dims.push_back(dim);
+      }
+    } else if (sections.size() > 0) {
+      PADDLE_ENFORCE_EQ(sections.size(), outs_number,
+                        "tensor split sections size "
+                        "should be equal to output size.");
+      for (size_t i = 0; i < outs_number; ++i) {
+        auto dim = in_dims;
+        dim[0] = sections[i];
+        outs_dims.push_back(dim);
+      }
+    }
+    ctx->SetOutputsDim("Out", outs_dims);
+  }
+};
+
+class SplitByrefOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SplitByrefOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(Tensor) Input tensor of the split operator.");
+    AddOutput("Out", "(Tensor) Output tensors of the split operator.")
+        .AsDuplicable();
+    AddComment(R"DOC(
+SplitByref operator
+
+Split the source tensor into several tensors along axis 0. No copy is
+performed by this operator; the output tensors share the source tensor's
+blocks of memory.
+)DOC");
+    AddAttr<std::vector<int>>("sections",
+                              "(vector<int>) "
+                              "the length of each output along the "
+                              "specified axis.")
+        .SetDefault(std::vector<int>{});
+    AddAttr<int>("num",
+                 "(int, default 0)"
+                 "Number of sub-tensors. This must evenly divide "
+                 "Input.dims()[axis]")
+        .SetDefault(0);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+// NOTE: concat op default axis must be 0!
+USE_CPU_ONLY_OP(concat);
+
+REGISTER_OPERATOR(split_byref, ops::SplitByrefOp, ops::SplitByrefOpMaker,
+                  ops::SplitGradMaker);
+REGISTER_OP_CPU_KERNEL(
+    split_byref,
+    ops::SplitByrefOpKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/split_byref_op.cu.cc b/paddle/fluid/operators/split_byref_op.cu.cc
new file mode 100644
index 00000000000000..1faf4f55dd54a2
--- /dev/null
+++ b/paddle/fluid/operators/split_byref_op.cu.cc
@@ -0,0 +1,18 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/split_byref_op.h"
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    split, ops::SplitByrefOpKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/fluid/operators/split_byref_op.h b/paddle/fluid/operators/split_byref_op.h
new file mode 100644
index 00000000000000..7c3ab1c1b9d955
--- /dev/null
+++ b/paddle/fluid/operators/split_byref_op.h
@@ -0,0 +1,43 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class SplitByrefOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in = ctx.Input<framework::Tensor>("X");
+    auto outs = ctx.MultiOutput<framework::Tensor>("Out");
+    auto in_stride = framework::stride_numel(in->dims());
+    auto place = ctx.GetPlace();
+
+    size_t input_offset = 0;
+    for (size_t i = 0; i < outs.size(); ++i) {
+      // NOTE: no need to call mutable_data here to allocate memory.
+      auto* out = outs[i];
+      out->ShareDataWith(in, input_offset);
+      input_offset += out->numel() * framework::SizeOfType(out->type());
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/split_op.cc b/paddle/fluid/operators/split_op.cc
index e745509ec8c1f2..a4398df36bcc2d 100644
--- a/paddle/fluid/operators/split_op.cc
+++ b/paddle/fluid/operators/split_op.cc
@@ -108,21 +108,6 @@ This operator splits the input tensor into multiple sub-tensors.
   }
 };
 
-class SplitGradMaker : public framework::SingleGradOpDescMaker {
- public:
-  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
-
- protected:
-  std::unique_ptr<framework::OpDesc> Apply() const override {
-    auto op = new framework::OpDesc();
-    op->SetType("concat");
-    op->SetInput("X", OutputGrad("Out"));
-    op->SetOutput("Out", InputGrad("X"));
-    op->SetAttrMap(Attrs());
-    return std::unique_ptr<framework::OpDesc>(op);
-  }
-};
-
 }  // namespace operators
 }  // namespace paddle
 
diff --git a/paddle/fluid/operators/split_op.h b/paddle/fluid/operators/split_op.h
index e2c41f44ab3ea3..f0c417c70521b1 100644
--- a/paddle/fluid/operators/split_op.h
+++ b/paddle/fluid/operators/split_op.h
@@ -44,5 +44,20 @@ class SplitOpKernel : public framework::OpKernel<T> {
   }
 };
 
+class SplitGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto op = new framework::OpDesc();
+    op->SetType("concat");
+    op->SetInput("X", OutputGrad("Out"));
+    op->SetOutput("Out", InputGrad("X"));
+    op->SetAttrMap(Attrs());
+    return std::unique_ptr<framework::OpDesc>(op);
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle

From 948628563f5313bce5c497c4cfd80f3d3d7774f8 Mon Sep 17 00:00:00 2001
From: typhoonzero
Date: Tue, 17 Apr 2018 15:04:10 +0800
Subject: [PATCH 3/9] update

---
 paddle/fluid/operators/split_byref_op.cu.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/operators/split_byref_op.cu.cc b/paddle/fluid/operators/split_byref_op.cu.cc
index 1faf4f55dd54a2..5ee6186f3541b7 100644
--- a/paddle/fluid/operators/split_byref_op.cu.cc
+++ b/paddle/fluid/operators/split_byref_op.cu.cc
@@ -15,4 +15,5 @@ limitations under the License. */
 #include "paddle/fluid/operators/split_byref_op.h"
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
-    split, ops::SplitByrefOpKernel<paddle::platform::CUDADeviceContext, float>);
+    split_byref,
+    ops::SplitByrefOpKernel<paddle::platform::CUDADeviceContext, float>);

From 0c6eef3e58b3cdc182d1d8531eb227abc065857f Mon Sep 17 00:00:00 2001
From: typhoonzero
Date: Tue, 17 Apr 2018 15:48:05 +0800
Subject: [PATCH 4/9] add split by ref test

---
 python/paddle/fluid/tests/unittests/test_split_op.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/python/paddle/fluid/tests/unittests/test_split_op.py b/python/paddle/fluid/tests/unittests/test_split_op.py
index 887bdfe8b36088..5a7123c36b17a3 100644
--- a/python/paddle/fluid/tests/unittests/test_split_op.py
+++ b/python/paddle/fluid/tests/unittests/test_split_op.py
@@ -19,7 +19,6 @@
 class TestSplitOp(OpTest):
     def setUp(self):
-        self.op_type = "split"
         axis = 1
         x = np.random.random((4, 5, 6)).astype('float32')
         out = np.split(x, [2, 3], axis)
@@ -28,6 +27,9 @@ def setUp(self):
         self.outputs = {'Out': [('out%d' % i, out[i]) \
             for i in xrange(len(out))]}
 
+    def _set_op_type(self):
+        self.op_type = "split"
+
     def test_check_output(self):
         self.check_output()
 
@@ -35,5 +37,10 @@ def test_check_grad(self):
         self.check_grad(['X'], ['out0', 'out1', 'out2'])
 
 
+class TestSplitByrefOp(OpTest):
+    def _set_op_type(self):
+        self.op_type = "split_byref"
+
+
 if __name__ == '__main__':
     unittest.main()

From ed89b7b7e6f7651a852be5cafdc1264f46bed65a Mon Sep 17 00:00:00 2001
From: typhoonzero
Date: Tue, 17 Apr 2018 17:23:02 +0800
Subject: [PATCH 5/9] dist train use split_by_ref

---
 python/paddle/fluid/distribute_transpiler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/fluid/distribute_transpiler.py b/python/paddle/fluid/distribute_transpiler.py
index aa15392d7e4901..0c21f859a7a205 100644
--- a/python/paddle/fluid/distribute_transpiler.py
+++ b/python/paddle/fluid/distribute_transpiler.py
@@ -824,7 +824,7 @@ def _append_split_op(self, program, gradblocks):
         for v in splited_vars:
             sections.append(v.shape[0])
         program.global_block().append_op(
-            type="split",
+            type="split_byref",
             inputs={"X": orig_var},
             outputs={"Out": splited_vars},
             attrs={"sections": sections}  # assume split evenly

From 69188e59811d03be9d4e90ccf15d1203684a4607 Mon Sep 17 00:00:00 2001
From: typhoonzero
Date: Tue, 17 Apr 2018 20:02:44 +0800
Subject: [PATCH 6/9] fix ut

---
 python/paddle/fluid/tests/unittests/test_split_op.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/paddle/fluid/tests/unittests/test_split_op.py b/python/paddle/fluid/tests/unittests/test_split_op.py
index 5a7123c36b17a3..eb49a53e54f4bd 100644
--- a/python/paddle/fluid/tests/unittests/test_split_op.py
+++ b/python/paddle/fluid/tests/unittests/test_split_op.py
@@ -19,6 +19,7 @@
 class TestSplitOp(OpTest):
     def setUp(self):
+        self._set_op_type()
         axis = 1
         x = np.random.random((4, 5, 6)).astype('float32')
         out = np.split(x, [2, 3], axis)

From 788636f078fae8b9b68e3afcf8e0eee5f52bc4fc Mon Sep 17 00:00:00 2001
From: typhoonzero
Date: Wed, 18 Apr 2018 13:28:41 +0800
Subject: [PATCH 7/9] update by comments

---
 paddle/fluid/framework/tensor.h         |  3 ---
 paddle/fluid/framework/tensor_impl.h    | 31 -------------------------
 paddle/fluid/operators/split_byref_op.h |  7 +++---
 3 files changed, 3 insertions(+), 38 deletions(-)

diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h
index f30dcc000b7142..5a6b24bfafbe76 100644
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -98,9 +98,6 @@ class Tensor {
   /*! The internal of two tensors share the same memory block. */
   inline Tensor& ShareDataWith(const Tensor& src);
 
-  /*! Share part of the memory of the two tensors */
-  inline Tensor& ShareDataWith(const Tensor* src, size_t offset);
-
   /**
    * @brief   Return a sub-tensor of the given tensor.
    *
diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h
index a177ef74166f20..f49d1a47a325b2 100644
--- a/paddle/fluid/framework/tensor_impl.h
+++ b/paddle/fluid/framework/tensor_impl.h
@@ -162,37 +162,6 @@ inline Tensor& Tensor::ShareDataWith(const Tensor& src) {
   return *this;
 }
 
-inline Tensor& Tensor::ShareDataWith(const Tensor* src, size_t offset) {
-  // NOTE: data size is determined by current tensor shape and data type
-  src->check_memory_size();
-  PADDLE_ENFORCE_EQ(src->type(), this->type(),
-                    "tensor data type must be the same when sharing data");
-  auto place = src->place();
-  auto type = src->type();
-  size_t size = src->numel() * SizeOfType(src->type());
-  auto* ref = src->data<uint8_t>() + offset;
-  if (platform::is_cpu_place(place)) {
-    holder_.reset(new SharedPlaceholderImpl<platform::CPUPlace>(
-        boost::get<platform::CPUPlace>(place), ref, size, type));
-  } else if (platform::is_gpu_place(place) ||
-             platform::is_cuda_pinned_place(place)) {
-#ifndef PADDLE_WITH_CUDA
-    PADDLE_THROW(
-        "CUDAPlace or CUDAPinnedPlace is not supported in CPU-only mode.");
-  }
-#else
-    if (platform::is_gpu_place(place)) {
-      holder_.reset(new SharedPlaceholderImpl<platform::CUDAPlace>(
-          boost::get<platform::CUDAPlace>(place), ref, size, type));
-    } else if (platform::is_cuda_pinned_place(place)) {
-      holder_.reset(new SharedPlaceholderImpl<platform::CUDAPinnedPlace>(
-          boost::get<platform::CUDAPinnedPlace>(place), ref, size, type));
-    }
-  }
-#endif
-  return *this;
-}
-
 inline Tensor Tensor::Slice(int begin_idx, int end_idx) const {
   check_memory_size();
   PADDLE_ENFORCE_GE(begin_idx, 0,
diff --git a/paddle/fluid/operators/split_byref_op.h b/paddle/fluid/operators/split_byref_op.h
index 7c3ab1c1b9d955..9b54c7c74acb51 100644
--- a/paddle/fluid/operators/split_byref_op.h
+++ b/paddle/fluid/operators/split_byref_op.h
@@ -26,15 +26,14 @@ class SplitByrefOpKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* in = ctx.Input<framework::Tensor>("X");
     auto outs = ctx.MultiOutput<framework::Tensor>("Out");
-    auto in_stride = framework::stride_numel(in->dims());
     auto place = ctx.GetPlace();
 
-    size_t input_offset = 0;
+    size_t row_offset = 0;
     for (size_t i = 0; i < outs.size(); ++i) {
       // NOTE: no need to call mutable_data here to allocate memory.
       auto* out = outs[i];
-      out->ShareDataWith(in, input_offset);
-      input_offset += out->numel() * framework::SizeOfType(out->type());
+      *out = std::move(in->Slice(row_offset, out->dims()[0]));
+      row_offset += out->dims()[0];
     }
   }
 };

From 184835856c94043a5c27f5da3921cdaba433273c Mon Sep 17 00:00:00 2001
From: typhoonzero
Date: Wed, 18 Apr 2018 14:44:17 +0800
Subject: [PATCH 8/9] fix copy size

---
 paddle/fluid/operators/detail/sendrecvop_utils.cc | 9 +++++----
 paddle/fluid/operators/split_byref_op.h           | 3 ++-
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/operators/detail/sendrecvop_utils.cc b/paddle/fluid/operators/detail/sendrecvop_utils.cc
index 16c612c45a37dd..69fcffe9bc3400 100644
--- a/paddle/fluid/operators/detail/sendrecvop_utils.cc
+++ b/paddle/fluid/operators/detail/sendrecvop_utils.cc
@@ -82,7 +82,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
         platform::CPUPlace cpu;
         auto& gpu_dev_ctx =
             static_cast<const platform::CUDADeviceContext&>(ctx);
-        auto copy_size = tensor.memory_size();
+        auto copy_size = tensor.numel() * framework::SizeOfType(tensor.type());
         payload = memory::Alloc(cpu, copy_size);
 
         memory::Copy(cpu, payload,
@@ -99,7 +99,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
       } else {
         payload = tensor.data<void>();
       }
-      payload_size = tensor.memory_size();
+      payload_size = tensor.numel() * framework::SizeOfType(tensor.type());
       e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, payload_size);
     } break;
     case framework::proto::VarType_Type_SELECTED_ROWS: {
@@ -118,7 +118,8 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
         platform::CPUPlace cpu;
         auto& gpu_dev_ctx =
             static_cast<const platform::CUDADeviceContext&>(ctx);
-        auto copy_size = tensor->memory_size();
+        auto copy_size =
+            tensor->numel() * framework::SizeOfType(tensor->type());
         payload = memory::Alloc(cpu, copy_size);
         memory::Copy(cpu, payload,
                      boost::get<platform::CUDAPlace>(tensor->place()),
@@ -133,7 +134,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
       } else {
         payload = slr->mutable_value()->data<void>();
       }
-      payload_size = tensor->memory_size();
+      payload_size = tensor->numel() * framework::SizeOfType(tensor->type());
       e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, payload_size);
     } break;
     default:
diff --git a/paddle/fluid/operators/split_byref_op.h b/paddle/fluid/operators/split_byref_op.h
index 9b54c7c74acb51..a3aad68ea736e2 100644
--- a/paddle/fluid/operators/split_byref_op.h
+++ b/paddle/fluid/operators/split_byref_op.h
@@ -32,7 +32,8 @@ class SplitByrefOpKernel : public framework::OpKernel<T> {
     for (size_t i = 0; i < outs.size(); ++i) {
       // NOTE: no need to call mutable_data here to allocate memory.
       auto* out = outs[i];
-      *out = std::move(in->Slice(row_offset, out->dims()[0]));
+      VLOG(3) << "splitting by ref: " << row_offset << " " << out->dims()[0];
+      *out = std::move(in->Slice(row_offset, row_offset + out->dims()[0]));
       row_offset += out->dims()[0];
     }
   }

From ff0d9341ead47b7880d8d34e600b6bcd6a31c52e Mon Sep 17 00:00:00 2001
From: typhoonzero
Date: Wed, 18 Apr 2018 18:46:21 +0800
Subject: [PATCH 9/9] remove not used code

---
 paddle/fluid/framework/tensor.h | 28 ----------------------------
 1 file changed, 28 deletions(-)

diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h
index 5a6b24bfafbe76..6f878541e6de1d 100644
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -176,34 +176,6 @@ class Tensor {
     std::type_index type_;
   };
 
-  template <typename Place>
-  struct SharedPlaceholderImpl : public Placeholder {
-    SharedPlaceholderImpl(Place place, const uint8_t* data, size_t size,
-                          std::type_index type)
-        : ptr_(data), place_(place), size_(size), type_(type) {}
-
-    virtual size_t size() const { return size_; }
-    virtual platform::Place place() const { return place_; }
-    virtual void* ptr() const {
-      return const_cast<void*>(static_cast<const void*>(ptr_));
-    }
-    virtual std::type_index type() const { return type_; }
-    virtual void set_type(std::type_index type) { type_ = type; }
-    virtual void set_place(platform::Place place) { place_ = place; }
-
-    /*! the pointer of memory block. */
-    const uint8_t* ptr_;
-
-    /*! the place of memory block. */
-    platform::Place place_;
-
-    /*! the size of memory block. */
-    size_t size_;
-
-    /* the current type of memory */
-    std::type_index type_;
-  };
-
   /*! holds the memory block if allocated. */
   std::shared_ptr<Placeholder> holder_;
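
---
Post-series note: after the review in PATCH 7/9, the zero-copy split no longer
needs the custom SharedPlaceholderImpl added in PATCH 1/9. Each output simply
becomes a Slice of the input along axis 0, so every output aliases the input's
allocation and only the row offset differs; the shared_ptr holder keeps the
memory block alive as long as any slice still uses it. The sketch below is a
minimal, self-contained illustration of that sharing scheme only -- it does
not use Paddle's real Tensor API, and TensorView, SplitByRef, and every other
name in it are invented for this note.

// split_byref_sketch.cc -- row-wise splitting by reference over a contiguous
// row-major buffer. Compile with: g++ -std=c++11 split_byref_sketch.cc
#include <cassert>
#include <cstddef>
#include <iostream>
#include <memory>
#include <vector>

struct TensorView {
  std::shared_ptr<float> holder;  // shared allocation, like Tensor::holder_
  size_t offset;                  // element offset into the allocation
  size_t rows;                    // rows visible through this view
  size_t row_width;               // elements per row (product of dims[1:])

  float* data() const { return holder.get() + offset; }
};

// Each output views rows [row_offset, row_offset + sections[i]) of the input;
// no element is copied, mirroring *out = in->Slice(begin, end) in the kernel.
std::vector<TensorView> SplitByRef(const TensorView& in,
                                   const std::vector<size_t>& sections) {
  std::vector<TensorView> outs;
  size_t row_offset = 0;
  for (size_t rows : sections) {
    assert(row_offset + rows <= in.rows);
    outs.push_back({in.holder, in.offset + row_offset * in.row_width, rows,
                    in.row_width});
    row_offset += rows;  // advance by whole rows, as the kernel does
  }
  return outs;
}

int main() {
  TensorView t;
  t.offset = 0;
  t.rows = 4;
  t.row_width = 3;
  t.holder = std::shared_ptr<float>(new float[t.rows * t.row_width](),
                                    std::default_delete<float[]>());
  for (size_t i = 0; i < t.rows * t.row_width; ++i) {
    t.data()[i] = static_cast<float>(i);
  }

  auto parts = SplitByRef(t, {1, 3});
  parts[1].data()[0] = 42.0f;        // writes through to the source tensor
  std::cout << t.data()[3] << "\n";  // prints 42: the memory is shared
  return 0;
}

This aliasing is also why PATCH 8/9 changes the serializer to compute the
copy size as numel() * SizeOfType(type()) instead of memory_size(): a sliced
output still points into the holder of the whole input, so its memory_size()
reflects the remainder of the shared block, while numel() covers exactly the
rows that belong to the slice.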