diff --git a/Jenkinsfile b/Jenkinsfile index e01621dc5374..9abe9c0365bd 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -205,7 +205,7 @@ del /Q *.7z def python_ut(docker_type) { timeout(time: max_time, unit: 'MINUTES') { sh "${docker_run} ${docker_type} PYTHONPATH=./python/ nosetests-2.7 --with-timer --verbose tests/python/unittest" - // sh "${docker_run} ${docker_type} PYTHONPATH=./python/ nosetests-3.4 --with-timer --verbose tests/python/unittest" + sh "${docker_run} ${docker_type} PYTHONPATH=./python/ nosetests-3.4 --with-timer --verbose tests/python/unittest" sh "${docker_run} ${docker_type} PYTHONPATH=./python/ nosetests-2.7 --with-timer --verbose tests/python/train" } } @@ -215,7 +215,7 @@ def python_ut(docker_type) { def python_gpu_ut(docker_type) { timeout(time: max_time, unit: 'MINUTES') { sh "${docker_run} ${docker_type} PYTHONPATH=./python/ nosetests-2.7 --with-timer --verbose tests/python/gpu" - // sh "${docker_run} ${docker_type} PYTHONPATH=./python/ nosetests-3.4 --with-timer --verbose tests/python/gpu" + sh "${docker_run} ${docker_type} PYTHONPATH=./python/ nosetests-3.4 --with-timer --verbose tests/python/gpu" } } diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index 12e2a270b02e..c4304b172985 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -222,21 +222,6 @@ MXNET_DLL int MXNDArrayCreate(const mx_uint *shape, int delay_alloc, NDArrayHandle *out); -/*! - * \brief create a NDArray with specified sparse type, shape and aux data(e.g. index) - * aux data is copied during construction. - */ -MXNET_DLL int MXNDArrayCreateSparse(NDArrayHandle data, - mx_uint num_aux, - NDArrayHandle *aux_data, - const mx_uint *shape, - mx_uint ndim, - int storage_type, - int dev_type, - int dev_id, - int delay_alloc, - int dtype, - NDArrayHandle *out); /*! * \brief create a NDArray with specified shape and data type * \param shape the pointer to the shape @@ -260,6 +245,20 @@ MXNET_DLL int MXNDArrayCreateEx(const mx_uint *shape, /*! * \brief create an empty sparse NDArray with specified shape and data type + * \param storage_type the storage type of the ndarray + * \param shape the pointer to the shape + * \param ndim the dimension of the shape + * \param dev_type device type, specify device we want to take + * \param dev_id the device id of the specific device + * \param delay_alloc whether to delay allocation until + * the narray is first mutated + * \param dtype data type of created array + * \param num_aux the number of aux data to support this ndarray + * \param aux_type data type of the aux data for the created array + * \param aux_ndims the dimension of the shapes of aux data + * \param aux_shape the shapes of aux data + * \param out the returning handle + * \return 0 when success, -1 when failure happens */ MXNET_DLL int MXNDArrayCreateSparseEx(int storage_type, const mx_uint *shape, @@ -269,7 +268,9 @@ MXNET_DLL int MXNDArrayCreateSparseEx(int storage_type, int delay_alloc, int dtype, mx_uint num_aux, - int *aux_types, + int *aux_type, + mx_uint *aux_ndims, + const mx_uint *aux_shape, NDArrayHandle *out); /*! @@ -439,13 +440,26 @@ MXNET_DLL int MXNDArrayGetData(NDArrayHandle handle, */ MXNET_DLL int MXNDArrayGetDType(NDArrayHandle handle, int *out_dtype); -// Get the aux type for ith aux data + +/*! 
+ * \brief get the type of the ith aux data in NDArray + * \param handle the handle to the narray + * \param i the index of the aux data + * \param out_type pointer holder to get type of aux data + * \return 0 when success, -1 when failure happens + */ MXNET_DLL int MXNDArrayGetAuxType(NDArrayHandle handle, mx_uint i, - int *out_aux_type); -// Get the num of aux data to help store sparse NDArray -MXNET_DLL int MXNDArrayGetNumAux(NDArrayHandle handle, - mx_uint *out_num_aux); + int *out_type); + +// Get the ith aux data blob wrapped in an NDArray +MXNET_DLL int MXNDArrayGetAuxNDArray(NDArrayHandle handle, + mx_uint i, + NDArrayHandle *out); + +// Get the data blob wrapped in an NDArray +MXNET_DLL int MXNDArrayGetDataNDArray(NDArrayHandle handle, + NDArrayHandle *out); /*! * \brief get the context of the NDArray * \param handle the handle to the narray diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h index 86a6afc4d8e9..71fc78527707 100644 --- a/include/mxnet/ndarray.h +++ b/include/mxnet/ndarray.h @@ -15,6 +15,7 @@ #include #include #include +#include #include #include "./base.h" #include "./storage.h" @@ -28,8 +29,22 @@ #endif namespace mxnet { -// forward declaration +// forward declarations class NDArray; + +namespace op { +template +void FillZerosRspImpl(mshadow::Stream *s, NDArray *dst); + +template +void CastStorageComputeImpl(mshadow::Stream *s, const NDArray& input, const NDArray& output); +}; + +namespace ndarray { +template +void Copy(const TBlob &from, TBlob *to, Context from_ctx, Context to_ctx, RunContext ctx); +}; + namespace autograd { class AGNode; @@ -72,19 +87,6 @@ enum NDArrayStorageType { kCSRStorage, // csr }; -/*! - * \brief issue an copy operation from one NDArray to another - * the two ndarray can sit on different devices - * this operation will be scheduled by the engine - * - * \param from the ndarray we want to copy data from - * \param to the target ndarray - * \param priority Priority of the action. - * \param alloc_output whether to allocate memory for the output ndarray - * \note The function name explicitly marks the order of from and to - * due to different possible convention carried by copy function. - */ -void CopyFromTo(const NDArray &from, NDArray *to, int priority = 0, bool alloc_output = true); /*! 
* \brief ndarray interface @@ -116,7 +118,8 @@ class NDArray { */ NDArray(const NDArrayStorageType storage_type, const TShape &shape, Context ctx, bool delay_alloc = true, int dtype = mshadow::default_type_flag, - std::vector aux_types = {}) + std::vector aux_types = {}, std::vector aux_shapes = {}, + TShape storage_shape = TShape({0})) : shape_(shape), offset_(0), dtype_(dtype), entry_({nullptr, 0, 0}) { // Assign default aux types if not given if (aux_types.size() == 0) { @@ -128,7 +131,30 @@ class NDArray { LOG(FATAL) << "Unknown storage type"; } } - ptr_ = std::make_shared(ctx, delay_alloc, aux_types, storage_type); + // Assign default shapes if not given + // unknown shapes are intialized as {0} such that Size() would return 0 + if (aux_shapes.size() == 0) { + if (storage_type == kRowSparseStorage) { + aux_shapes = {TShape({0})}; + } else if (storage_type == kCSRStorage) { + // aux shapes for indptr and indices + aux_shapes = {TShape({0}), TShape({0})}; + } else { + LOG(FATAL) << "Unknown storage type"; + } + } + if (storage_shape.Size() == 0) { + if (storage_type == kRowSparseStorage) { + storage_shape = shape; + storage_shape[0] = aux_shapes[rowsparse::kIdx][0]; + } else if (storage_type == kCSRStorage) { + storage_shape = aux_shapes[csr::kIdx]; + } else { + LOG(FATAL) << "Unknown storage type"; + } + } + ptr_ = std::make_shared(storage_type, storage_shape, ctx, delay_alloc, + dtype, aux_types, aux_shapes); #if MKL_EXPERIMENTAL == 1 Mkl_mem_ = std::make_shared(); #endif @@ -149,14 +175,6 @@ class NDArray { Mkl_mem_ = std::make_shared(); #endif } - NDArray(NDArray data, const std::vector aux_data, Context ctx, - NDArrayStorageType storage_type, const TShape &shape) - : ptr_(std::make_shared(data, aux_data, ctx, storage_type)), shape_(shape), - offset_(0), dtype_(data.data().type_flag_), entry_({nullptr, 0, 0}) { -#if MKL_EXPERIMENTAL == 1 - Mkl_mem_ = std::make_shared(); -#endif - } /*! * \return the shape of current NDArray. 
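The sparse constructor above only fills in default aux types, aux shapes and the storage shape when they are not supplied; from the Python front-end added later in this patch, the same path is exercised by mx.sparse_nd.zeros and mx.sparse_nd.csr. A minimal usage sketch (shapes and values are illustrative only):

    import mxnet as mx
    import numpy as np

    # empty row_sparse array: aux types/shapes fall back to the per-storage-type defaults
    a = mx.sparse_nd.zeros('row_sparse', (2, 2))
    print(a.asnumpy())          # materialized as all zeros via to_dense()

    # csr array built from explicit values/indptr/indices
    indptr  = np.array([0, 2, 3, 6])
    indices = np.array([0, 2, 2, 0, 1, 2])
    data    = np.array([1, 2, 3, 4, 5, 6])
    b = mx.sparse_nd.csr(data, indptr, indices, (3, 3))
    print(b.aux_types)          # index dtypes chosen from the storage-type defaults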
@@ -195,7 +213,6 @@ class NDArray { // TODO(haibin) CamelCase inline const TShape aux_shape(size_t i) const { CHECK(storage_type() != kDefaultStorage); - if (i >= ptr_->aux_shapes.size()) return TShape(); return ptr_->aux_shapes[i]; } @@ -217,14 +234,19 @@ class NDArray { CHECK(ptr_ != nullptr); TBlob res; TShape shape = shape_; - if (storage_type() != kDefaultStorage) { - CHECK(offset_ == 0) << "Non-default storage should never set offset_"; - shape = storage_shape(); - } + auto stype = storage_type(); MSHADOW_TYPE_SWITCH(dtype(), DType, { - CHECK(ptr_->shandle.dptr != nullptr); - res = TBlob(static_cast(ptr_->shandle.dptr) - + offset_, shape, ptr_->shandle.ctx.dev_mask(), dtype()); + auto dptr = static_cast(ptr_->shandle.dptr); + if (stype == kDefaultStorage) { + dptr += offset_; + } else if (stype == kCSRStorage) { + shape = storage_shape(); + } else if (stype == kRowSparseStorage) { + shape = storage_shape(); + } else { + LOG(FATAL) << "unknown storage type " << stype; + } + res = TBlob(dptr, shape, ptr_->shandle.ctx.dev_mask(), dtype()); }); #if MKL_EXPERIMENTAL == 1 res.Mkl_mem_ = Mkl_mem_; @@ -235,12 +257,23 @@ class NDArray { * \return the aux TBlob */ inline TBlob aux_data(size_t i) const { - CHECK(storage_type() != kDefaultStorage); + auto stype = storage_type(); TBlob res; - CHECK(i < ptr_->aux_handles.size()); - MSHADOW_TYPE_SWITCH(aux_type(i), DType, { - res = TBlob(static_cast(ptr_->aux_handles[i].dptr), aux_shape(i), - ptr_->aux_handles[i].ctx.dev_mask(), aux_type(i)); + auto shape = aux_shape(i); + auto type = aux_type(i); + MSHADOW_TYPE_SWITCH(type, DType, { + auto dptr = static_cast(ptr_->aux_handles[i].dptr); + if (stype == kRowSparseStorage) { + CHECK_EQ(offset_, 0); + } else if (stype == kCSRStorage) { + if (i == csr::kIndPtr) { + dptr += offset_; + shape[0] = shape_[0] + 1; + } + } else { + LOG(FATAL) << "Unexpected storage type"; + } + res = TBlob(dptr, shape, ptr_->aux_handles[i].ctx.dev_mask(), type); }); #if MKL_EXPERIMENTAL == 1 res.Mkl_mem_ = Mkl_mem_; @@ -284,14 +317,22 @@ class NDArray { if (is_none()) return kUndefinedStorage; return ptr_->storage_type; } - inline size_t num_aux() const { - if (is_none()) return 0; - return ptr_->aux_handles.size(); - } /*! \return whether this ndarray is not initialized */ inline bool is_none() const { return ptr_.get() == nullptr; } + // returns true if a sparse ndarray's aux_data and storage are initialized + inline bool storage_initialized() const { + if (is_none()) return false; + auto stype = storage_type(); + CHECK_NE(stype, kDefaultStorage); + if (stype == kRowSparseStorage || stype == kCSRStorage) { + return aux_shape(0).Size() != 0; + } else { + LOG(FATAL) << "Unknown storage type"; + } + return true; + } /*! * \brief Block until all the pending write operations with respect * to current NDArray are finished, and read can be performed. @@ -425,19 +466,28 @@ class NDArray { void SyncCopyToCPU(void *data, size_t size) const; /*! 
* \brief Slice a NDArray - * \param begin begin index in first dim - * \param end end index in first dim + * \param begin begin index in first dim (inclusive) + * \param end end index in first dim (exclusive) * \return sliced NDArray */ inline NDArray Slice(index_t begin, index_t end) const { NDArray ret = *this; CHECK(!is_none()) << "NDArray is not initialized"; CHECK_GE(shape_[0], end) << "Slice end index out of range"; - CHECK(storage_type() == kDefaultStorage) << "Slice not yet implemented for storage " - << storage_type(); - size_t length = shape_.ProdShape(1, shape_.ndim()); - ret.offset_ += begin * length; - ret.shape_[0] = end - begin; + auto stype = storage_type(); + if (stype == kDefaultStorage) { + size_t length = shape_.ProdShape(1, shape_.ndim()); + ret.offset_ += begin * length; + ret.shape_[0] = end - begin; + } else if (stype == kCSRStorage) { + // for csr, the offset variable is used to adjust indptr + // while getting aux_data, the dptr of indptr is advanced by offset, + // and shape for indptr is end - begin + 1 + ret.offset_ += begin; + ret.shape_[0] = end - begin; + } else { + LOG(FATAL) << "Slice not yet implemented for storage " << stype; + } return ret; } /*! @@ -460,6 +510,19 @@ class NDArray { } return ret; } + // Wrap the tblob of aux data into an NDArray which shares the same variable with the + // current one. + inline const NDArray AuxNDArray(size_t i) const { + CHECK_NE(storage_type(), kDefaultStorage); + CHECK(i < ptr_->aux_shapes.size()); + return NDArray(aux_data(i), ctx().dev_id, var()); + } + // Wrap the tblob of data into an NDArray which shares the same variable with the + // current one. + inline const NDArray DataNDArray() const { + CHECK_NE(storage_type(), kDefaultStorage); + return NDArray(data(), ctx().dev_id, var()); + } /*! * \brief Create a NDArray that shares memory with current one * The new array must have smaller memory size than the current array. @@ -562,7 +625,8 @@ class NDArray { */ /*! \brief construct from static data */ bool static_data; - /*! \brief whether allocation is delayed. */ + /*! \brief whether data allocation is delayed. This doesn't indicate whether aux data + allocation is delayed. */ bool delay_alloc; // the type of the storage. The storage_type is never kUndefinedStorage once the chunk // is constructed. @@ -573,8 +637,10 @@ class NDArray { Context ctx; // The shape of the chunk data. // This might not be the same shape as the NDArray, since the storage may be sparse. + // The default value for storage_shape is {0} when an empty non-default NDArray is created. TShape storage_shape; - // The shape of aux data. The default value for the shape is 0. + // The shape of aux data. The default value for the shape depends on the type of storage. + // If aux_shapes[i].Size() is zero, aux data i is empty. std::vector aux_shapes; // \brief skip the deletion of var handle. Usually set when shared_var is present. 
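As the csr branch of Slice above notes, slicing only bumps offset_ (used to advance the indptr pointer in aux_data) and shrinks the first dimension; the values and indices blobs stay shared with the source array. The Python _slice docstring further down documents the same behaviour; a short sketch with the same illustrative data:

    import mxnet as mx
    import numpy as np

    indptr  = np.array([0, 2, 3, 6])
    indices = np.array([0, 2, 2, 0, 1, 2])
    data    = np.array([1, 2, 3, 4, 5, 6])
    a = mx.sparse_nd.csr(data, indptr, indices, (3, 3))

    s = a[1:2]                    # read-only view; only indptr is adjusted
    print(s.asnumpy())            # [[0, 0, 3]]
    print(s.indptr.asnumpy())     # [2, 3]
    print(s.indices.asnumpy())    # unchanged superset: [0, 2, 2, 0, 1, 2]
    print(s.values.asnumpy())     # unchanged superset: [1, 2, 3, 4, 5, 6]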
bool skip_delete_var = false; @@ -589,44 +655,6 @@ class NDArray { shandle.ctx = ctx_; if (!delay_alloc_) this->CheckAndAlloc(); } - // construct a chunk by copying over data - Chunk(const NDArray &nd, const std::vector &nd_aux, Context ctx_, - NDArrayStorageType storage_type_) - : static_data(false), delay_alloc(false), storage_type(storage_type_), ctx(ctx_) { - // Vars - var = Engine::Get()->NewVariable(); - // Data Storage - const auto &data = nd.data(); - storage_shape = data.shape_; - shandle.ctx = ctx; - shandle.size = data.shape_.Size() * mshadow::mshadow_sizeof(data.type_flag_); - shandle = Storage::Get()->Alloc(shandle.size, shandle.ctx); - - // Copy data - // Single threaded copy may not saturate memory bandwidth - CHECK_EQ(nd.storage_type(), kDefaultStorage); - auto data_blob = TBlob(shandle.dptr, storage_shape, shandle.ctx.dev_mask(), data.type_flag_); - NDArray data_wrapper(data_blob, ctx.dev_id, var); - CopyFromTo(nd, &data_wrapper, 0, false); - - // Aux shapes, types and storage - CHECK_GT(storage_shape.ndim(), 0); - for (size_t i = 0; i < nd_aux.size(); i++) { - const auto &aux_d = nd_aux[i].data(); - aux_shapes.emplace_back(aux_d.shape_); - aux_types.emplace_back(aux_d.type_flag_); - Storage::Handle aux_handle; - aux_handle.ctx = ctx; - aux_handle.size = aux_shapes[i].Size() * mshadow::mshadow_sizeof(aux_types[i]); - aux_handle = Storage::Get()->Alloc(aux_handle.size, aux_handle.ctx); - aux_handles.emplace_back(aux_handle); - // Copy aux data - CHECK_EQ(nd_aux[i].storage_type(), kDefaultStorage); - TBlob aux_blob(aux_handle.dptr, aux_shapes[i], ctx.dev_mask(), aux_types[i]); - NDArray aux_wrapper(aux_blob, ctx.dev_id, var); - CopyFromTo(nd_aux[i], &aux_wrapper, 0, false); - } - } Chunk(const TBlob &data, int dev_id, Engine::VarHandle shared_var) : static_data(true), delay_alloc(false) { @@ -651,15 +679,20 @@ class NDArray { shandle.size = data.shape_.Size() * mshadow::mshadow_sizeof(data.type_flag_); storage_shape = data.shape_; } - Chunk(Context ctx_, bool delay_alloc_, std::vector aux_types_, - NDArrayStorageType storage_type_) + // Constructor for a non-default storage chunk + Chunk(NDArrayStorageType storage_type_, const TShape &storage_shape_, Context ctx_, + bool delay_alloc_, int dtype, const std::vector &aux_types_, + const std::vector &aux_shapes_) : static_data(false), delay_alloc(delay_alloc_), storage_type(storage_type_), - aux_types(aux_types_), ctx(ctx_) { + aux_types(aux_types_), ctx(ctx_), storage_shape(storage_shape_), + aux_shapes(aux_shapes_) { var = Engine::Get()->NewVariable(); - // Assume alloc is always delayed for non-default storage type - CHECK(delay_alloc_); - if (!delay_alloc_) { - this->CheckAndAlloc(); + // aux_handles always reflect the correct number of aux data + for (size_t i = 0; i < aux_shapes.size(); i++) { + CheckAndAllocAuxData(i, aux_shapes[i]); + } + if (!delay_alloc) { + CheckAndAllocData(storage_shape, dtype); } } /*! 
\brief check if delay alloc is on, do alloc if not yet done */ @@ -672,55 +705,63 @@ class NDArray { inline void CheckAndAlloc(const TShape &shape, const std::vector &aux_shapes, int dtype) { // calculate size, perform allocation - if (delay_alloc) { - if (kRowSparseStorage == storage_type) { - // For row sparse, aux_shape indicates the number of rows to allocate - auto aux_shape = aux_shapes[rowsparse::kIdx]; - CHECK_EQ(shape.ndim(), 2) << "High dim RowSparse not yet implemented"; - CheckAndAllocAuxData(rowsparse::kIdx, aux_shape); - TShape storage_shape(shape); - storage_shape[0] = aux_shape[0]; - CheckAndAllocData(storage_shape, dtype); - } else if (kCSRStorage == storage_type) { - CheckAndAllocAuxData(csr::kIndPtr, aux_shapes[csr::kIndPtr]); - CheckAndAllocAuxData(csr::kIdx, aux_shapes[csr::kIdx]); - CheckAndAllocData(aux_shapes[csr::kIdx], dtype); - } else { - LOG(FATAL) << "Storage type " << storage_type << " not implemented for CheckAndAlloc"; - } + if (kRowSparseStorage == storage_type) { + // For row sparse, aux_shape indicates the number of rows to allocate + auto aux_shape = aux_shapes[rowsparse::kIdx]; + CHECK_EQ(shape.ndim(), 2) << "High dim RowSparse not yet implemented"; + CheckAndAllocAuxData(rowsparse::kIdx, aux_shape); + TShape storage_shape(shape); + storage_shape[0] = aux_shape[0]; + CheckAndAllocData(storage_shape, dtype); + } else if (kCSRStorage == storage_type) { + CheckAndAllocAuxData(csr::kIndPtr, aux_shapes[csr::kIndPtr]); + CheckAndAllocAuxData(csr::kIdx, aux_shapes[csr::kIdx]); + CheckAndAllocData(aux_shapes[csr::kIdx], dtype); + } else { + LOG(FATAL) << "Storage type " << storage_type << " not implemented for CheckAndAlloc"; } } // create storage handle for data based on shape and dtype, assuming ctx is set - // shandle and storage shape are updated + // storage shape is also updated + // if data is already allocated, try reuse the storage. Otherwise, free the current one + // and allocate new storage inline void CheckAndAllocData(const TShape &shape, int dtype) { CHECK_NE(aux_shapes.size(), 0) << "data is expected to be allocated after aux_data"; + auto dbytes = shape.Size() * mshadow::mshadow_sizeof(dtype); + if (shandle.size < dbytes) { + // free storage if necessary and alloc again + if (shandle.size > 0) Storage::Get()->Free(shandle); + // init storage + shandle = Storage::Get()->Alloc(dbytes, ctx); + } // init shape storage_shape = shape; - // init storage - auto dbytes = shape.Size() * mshadow::mshadow_sizeof(dtype); - shandle = Storage::Get()->Alloc(dbytes, ctx); // delay_alloc is only set when data storage handle is present delay_alloc = false; } - // create storage handle for aux data based on shape, assuming ctx and aux type are set - // aux_handle and aux shape are updated + // create storage handle for aux data based on shape + // this function assumes ctx, aux shapes and aux types are set + // aux shape is also updated + // if aux data is already allocated, try reuse the storage. 
Otherwise, free the current one + // and allocate new storage inline void CheckAndAllocAuxData(size_t i, const TShape &shape) { - CHECK_GT(shape.Size(), 0) << "shape cannot be empty in CheckAndAllocAuxData"; CHECK_EQ(shape.ndim(), 1) << "shape must be 1D in CheckAndAllocAuxData"; - CHECK_EQ(aux_shapes.size(), aux_handles.size()); CHECK_NE(storage_type, kUndefinedStorage) << "storage type cannot be kUndefinedStorage in CheckAndAllocAuxData"; CHECK_NE(storage_type, kDefaultStorage) << "storage type cannot be kDefaultStorage in CheckAndAllocAuxData"; - if (aux_shapes.size() <= i) { - aux_shapes.resize(i + 1); + if (aux_handles.size() <= i) { aux_handles.resize(i + 1); } + size_t aux_bytes = shape.Size() * mshadow::mshadow_sizeof(aux_types[i]); + if (aux_handles[i].size < aux_bytes) { + // free storage if necessary and alloc again + if (aux_handles[i].size > 0) Storage::Get()->Free(aux_handles[i]); + // init aux storage + aux_handles[i] = Storage::Get()->Alloc(aux_bytes, ctx); + } // init shape aux_shapes[i] = shape; - // Init aux storage - size_t aux_bytes = shape.Size() * mshadow::mshadow_sizeof(aux_types[i]); - aux_handles[i] = Storage::Get()->Alloc(aux_bytes, ctx); } /*! \brief destructor */ ~Chunk() { @@ -747,13 +788,95 @@ class NDArray { /*! \brief shape of current NDArray */ TShape shape_; /*! \brief offset in chunk */ - size_t offset_; + size_t offset_ = 0; /*! \brief type of data */ int dtype_ = -1; /*! \brief node entry for autograd */ autograd::AGNodeEntry entry_; }; +/*! + * \brief issue an copy operation from one NDArray to another + * the two ndarray can sit on different devices + * this operation will be scheduled by the engine + * + * \param from the ndarray we want to copy data from + * \param to the target ndarray + * \param priority Priority of the action. + * \param alloc_output whether to allocate memory for the output ndarray + * \note The function name explicitly marks the order of from and to + * due to different possible convention carried by copy function. 
+ */ +void CopyFromTo(const NDArray &from, NDArray *to, int priority = 0, bool alloc_output = true); + +// Make a copy of a row-sparse NDArray +template +inline void CopyFromToRspImpl(const NDArray from, NDArray *to, RunContext ctx, bool alloc) { + using namespace mshadow; + CHECK_EQ(from.storage_type(), to->storage_type()) << "Copying with different storage type"; + // if source is zeros, fill destination with zeros, too + auto s = ctx.get_stream(); + if (!from.storage_initialized()) { + op::FillZerosRspImpl(s, to); + return; + } + auto aux_shape = from.aux_shape(rowsparse::kIdx); + if (alloc) to->CheckAndAlloc({aux_shape}); + TBlob val = to->data(); + TBlob idx = to->aux_data(rowsparse::kIdx); + ndarray::Copy(from.data(), &val, + from.ctx(), to->ctx(), ctx); + ndarray::Copy(from.aux_data(rowsparse::kIdx), &idx, + from.ctx(), to->ctx(), ctx); +} + +// Make a copy of a dense NDArray +template +inline void CopyFromToDnsImpl(const NDArray from, NDArray *to, RunContext ctx, bool alloc) { + using namespace mshadow; + CHECK_EQ(from.storage_type(), to->storage_type()) << "Copying with different storage type"; + if (alloc) to->CheckAndAlloc(); + TBlob tmp = to->data(); + ndarray::Copy(from.data(), &tmp, + from.ctx(), to->ctx(), ctx); + auto &gpu_tid = typeid(mshadow::gpu); + if (typeid(from_xpu) == gpu_tid || typeid(to_xpu) == gpu_tid) { + // Wait GPU kernel to complete + ctx.get_stream()->Wait(); + } +} + +// Make a copy of an NDArray based on storage type +template +void CopyFromToImpl(const NDArray from, NDArray *to, RunContext ctx, bool alloc) { + // if storage type doesn't match, cast the storage first + auto from_stype = from.storage_type(); + auto to_stype = to->storage_type(); + NDArray casted_nd; + if (from_stype != to_stype) { + TShape shape = from.shape(); + auto from_ctx = from.ctx(); + auto s = ctx.get_stream(); + // TODO(haibin) inplace conversion + if (to_stype == kDefaultStorage) { + casted_nd = NDArray(shape, from_ctx); + } else { + casted_nd = NDArray(to_stype, shape, from_ctx); + } + op::CastStorageComputeImpl(s, from, casted_nd); + } else { + casted_nd = from; + } + if (to_stype == kDefaultStorage) { + CopyFromToDnsImpl(casted_nd, to, ctx, alloc); + } else if (to_stype == kRowSparseStorage) { + CopyFromToRspImpl(casted_nd, to, ctx, alloc); + } else { + // TODO(haibin) support csr copy. For sliced csr, we want to only copy the related + // indices and values instead of the superset. + LOG(FATAL) << "Not implemented yet"; + } +} /*! * \brief Perform elementwise sum over each data from source, store result into out. diff --git a/include/mxnet/storage.h b/include/mxnet/storage.h index 1b765233947d..e236a9cf313b 100644 --- a/include/mxnet/storage.h +++ b/include/mxnet/storage.h @@ -23,11 +23,11 @@ class Storage { /*! * \brief Pointer to the data. */ - void* dptr; + void* dptr{nullptr}; /*! * \brief Size of the storage. */ - size_t size; + size_t size{0}; /*! * \brief Context information about device and ID. 
*/ diff --git a/python/mxnet/optimizer.py b/python/mxnet/optimizer.py index 124f1f6218ee..fa2c6343f7e5 100644 --- a/python/mxnet/optimizer.py +++ b/python/mxnet/optimizer.py @@ -339,10 +339,10 @@ def update(self, index, weight, grad, state): if state is not None: sparse_sgd_mom_update(weight, grad, state, out=weight, - lr=lr, wd=wd, **self.kwargs) + lr=lr, wd=wd, **self.kwargs) else: sparse_sgd_update(weight, grad, out=weight, - lr=lr, wd=wd, **self.kwargs) + lr=lr, wd=wd, **self.kwargs) @register diff --git a/python/mxnet/sparse_ndarray.py b/python/mxnet/sparse_ndarray.py index be726eac53e6..f8788744a312 100644 --- a/python/mxnet/sparse_ndarray.py +++ b/python/mxnet/sparse_ndarray.py @@ -1,23 +1,21 @@ # coding: utf-8 -# pylint: disable= too-many-lines, redefined-builtin, protected-access -# pylint: disable=import-error, no-name-in-module, undefined-variable -"""NDArray API of mxnet.""" +"""SparseNDArray API of mxnet.""" from __future__ import absolute_import from __future__ import division -# try: -# from __builtin__ import slice as py_slice -# except ImportError: -# from builtins import slice as py_slice +try: + from __builtin__ import slice as py_slice +except ImportError: + from builtins import slice as py_slice import ctypes -# import warnings +import warnings import os as _os import sys as _sys # import operator import numpy as np -from .base import _LIB # , string_types, numeric_types +from .base import _LIB, numeric_types #string_types from .base import c_array, mx_real_t # , py_str, c_str from .base import mx_uint, NDArrayHandle, check_call # from .base import ctypes2buffer @@ -25,7 +23,7 @@ from . import _ndarray_internal as _internal from . import ndarray from .ndarray import _DTYPE_NP_TO_MX, _DTYPE_MX_TO_NP -from .ndarray import _STORAGE_TYPE_ID_TO_STR, _STORAGE_TYPE_STR_TO_ID +from .ndarray import _STORAGE_TYPE_STR_TO_ID#, _STORAGE_TYPE_ID_TO_STR from .ndarray import NDArray # Use different verison of SymbolBase @@ -48,9 +46,8 @@ } -def _new_alloc_handle(storage_type, shape, ctx, delay_alloc=True, - dtype=mx_real_t, aux_types=None): - """Return a new handle with specified shape and context. +def _new_alloc_handle(storage_type, shape, ctx, delay_alloc, dtype, aux_types, aux_shapes=None): + """Return a new handle with specified shape, type and context. Empty handle is only used to hold results @@ -60,7 +57,10 @@ def _new_alloc_handle(storage_type, shape, ctx, delay_alloc=True, A new empty ndarray handle """ hdl = NDArrayHandle() - aux_type_list = [int(_DTYPE_NP_TO_MX[np.dtype(aux_t).type]) for aux_t in aux_types] + aux_type_ids = [int(_DTYPE_NP_TO_MX[np.dtype(aux_t).type]) for aux_t in aux_types] + aux_shapes = [(0,) for aux_t in aux_types] if aux_shapes is None else aux_shapes + aux_shape_lens = [len(aux_shape) for aux_shape in aux_shapes] + aux_shapes = sum(aux_shapes, ()) num_aux = mx_uint(len(aux_types)) check_call(_LIB.MXNDArrayCreateSparseEx( ctypes.c_int(int(_STORAGE_TYPE_STR_TO_ID[storage_type])), @@ -71,18 +71,22 @@ def _new_alloc_handle(storage_type, shape, ctx, delay_alloc=True, ctypes.c_int(int(delay_alloc)), ctypes.c_int(int(_DTYPE_NP_TO_MX[np.dtype(dtype).type])), num_aux, - c_array(ctypes.c_int, aux_type_list), + c_array(ctypes.c_int, aux_type_ids), + c_array(mx_uint, aux_shape_lens), + c_array(mx_uint, aux_shapes), ctypes.byref(hdl))) return hdl class SparseNDArray(NDArray): - ''' sparse ndarray ''' + """An array object representing a multidimensional, homogeneous array of +fixed-size items, stored in sparse format. 
+ + """ __slots__ = [] - # def __repr__(self): - def __reduce__(self): - return SparseNDArray, (None,), self.__getstate__() + #def __reduce__(self): + # return SparseNDArray, (None,), self.__getstate__() def __add__(self, other): raise Exception('Not implemented for SparseND yet!') @@ -163,16 +167,109 @@ def __setstate__(self, state): raise Exception('Not implemented for SparseND yet!') def __setitem__(self, key, value): - raise Exception('Not implemented for SparseND yet!') + """x.__setitem__(i, y) <=> x[i]=y + + Set self[key] to value. + + Parameters + ---------- + key : slice + The indexing key. + value : NDArray or numpy.ndarray + The value to set. + + """ + if not self.writable: + raise ValueError('Failed to assign to a readonly NDArray') + if isinstance(key, py_slice): + if key.step is not None or key.start is not None or key.stop is not None: + raise ValueError('Assignment with slicing not supported in SparseNDArray.') + if isinstance(value, NDArray): + # avoid copying to itself + if value.handle is not self.handle: + value.copyto(self) + elif isinstance(value, numeric_types): + raise Exception("Assigning numeric types to SparseNDArray not supported yet.") + elif isinstance(value, (np.ndarray, np.generic)): + # TODO(haibin) this is not efficient. Implement sync_copyfrom for + # sparse ndarray to avoid an extra copy + warnings.warn('Assigning non-NDArray object to SparseNDArray is not efficient', + RuntimeWarning) + tmp = ndarray.array(value) + tmp.copyto(self) + else: + raise TypeError('type %s not supported' % str(type(value))) + else: + assert(isinstance(key, (int, tuple))) + raise Exception('SparseNDArray only supports [:] for assignment') def __getitem__(self, key): - raise Exception('Not implemented for SparseND yet!') + stype = self.storage_type + assert(stype == 'csr'), "getitem for " + str(stype) + " not implemented yet" + if isinstance(key, int): + raise Exception("Not implemented yet") + if isinstance(key, py_slice): + if key.step is not None: + raise ValueError('NDArray only supports continuous slicing on axis 0') + if key.start is not None or key.stop is not None: + return self._slice(key.start, key.stop) + else: + return self + if isinstance(key, tuple): + raise ValueError('Multi-dimension indexing is not supported') def _sync_copyfrom(self, source_array): raise Exception('Not implemented for SparseND yet!') def _slice(self, start, stop): - raise Exception('Not implemented for SparseND yet!') + """Returns a read-only sliced SparseNDArray that shares memory with current one. + For csr SparseNDArray, it only slices the indptr array, and keeps the original values + and indices. + + The existing slice operation is not very efficient when it's copied, since the indices + and values are a superset of the sliced region. + + + Parameters + ---------- + start : int + Starting index of slice. + stop : int + Finishing index of slice. 
+ + Example + ---------- + >>> indptr = np.array([0, 2, 3, 6]) + >>> indices = np.array([0, 2, 2, 0, 1, 2]) + >>> data = np.array([1, 2, 3, 4, 5, 6]) + >>> a = mx.sparse_nd.csr(data, indptr, indices, (3, 3)) + >>> a.asnumpy() + array([[1, 0, 2], + [0, 0, 3], + [4, 5, 6]]) + + >>> a[1:2].asnumpy() + array([[0, 0, 3]]) + + >>> a[1:2].indptr.asnumpy() + array([[2, 3]]) + + >>> a[1:2].indicies.asnumpy() + array([0, 2, 2, 0, 1, 2]) + + >>> a[1:2].values.asnumpy() + array([1, 2, 3, 4, 5, 6]) + + """ + stype = self.storage_type + assert(stype == 'csr'), "_slice for " + str(stype) + " not implemented yet" + warnings.warn('slicing SparseNDArray is not efficient', RuntimeWarning) + handle = NDArrayHandle() + start = mx_uint(start) if start else mx_uint(0) + stop = mx_uint(stop) if stop else mx_uint(self.shape[0]) + check_call(_LIB.MXNDArraySlice( + self.handle, start, stop, ctypes.byref(handle))) + return SparseNDArray(handle=handle, writable=False) def _at(self, idx): raise Exception('at operator for SparseND is not supported.') @@ -183,200 +280,311 @@ def reshape(self, shape): def broadcast_to(self, shape): raise Exception('Not implemented for SparseND yet!') - # def wait_to_read(self): - # @property - # def shape(self): - def aux_type(self, i): + def _aux_type(self, i): + """Data-type of the array’s ith aux data. + + Returns + ------- + numpy.dtype + This NDArray's data type. + """ aux_type = ctypes.c_int() - check_call(_LIB.MXNDArrayGetAuxType( - self.handle, i, ctypes.byref(aux_type))) + check_call(_LIB.MXNDArrayGetAuxType(self.handle, i, ctypes.byref(aux_type))) return _DTYPE_MX_TO_NP[aux_type.value] @property - def size(self): - raise Exception('Not implemented for SparseND yet!') + def values(self): + return self._data(0) + + @property + def indices(self): + stype = self.storage_type + if stype == 'row_sparse': + return self._aux_data(0) + elif stype == 'csr': + return self._aux_data(1) + raise Exception("unknown storage type " + stype) + + @property + def indptr(self): + stype = self.storage_type + if stype == 'csr': + return self._aux_data(0) + raise Exception("unknown storage type " + stype) - # @property - # def context(self): - # @property - # def dtype(self): @property - def num_aux(self): - num_aux = mx_uint() - check_call(_LIB.MXNDArrayGetNumAux(self.handle, ctypes.byref(num_aux))) - return num_aux.value + def _num_aux(self): + ''' The number of aux data used to help store the sparse ndarray. + ''' + return len(_STORAGE_AUX_TYPES[self.storage_type]) + @property # pylint: disable= invalid-name, undefined-variable def T(self): - raise Exception('Not implemented for SparseND yet!') - # TODO(haibin) Should this be a property? + raise Exception('Transpose is not supported for SparseNDArray.') + + @property def aux_types(self): + ''' The data types of the aux data for the SparseNDArray. + ''' aux_types = [] - num_aux = self.num_aux + num_aux = self._num_aux for i in xrange(num_aux): - aux_types.append(self.aux_type(i)) + aux_types.append(self._aux_type(i)) return aux_types def asnumpy(self): """Return a dense ``numpy.ndarray`` object with value copied from this array - """ - dense_nd = self.to_dense() - return dense_nd.asnumpy() - def asscalar(self): - raise Exception('Not implemented for SparseND yet!') + """ + return self.to_dense().asnumpy() def astype(self, dtype): raise Exception('Not implemented for SparseND yet!') def copyto(self, other): + """Copies the value of this array to another array. 
+ + If ``other`` is a ``NDArray`` object, then ``other.shape`` and + ``self.shape`` should be the same. This function copies the value from + ``self`` to ``other``. + + If ``other`` is a context, a new ``NDArray`` will be first created on + the target context, and the value of ``self`` is copied. + + Parameters + ---------- + other : NDArray or Context + The destination array or context. + + Returns + ------- + NDArray + The copied array. If ``other`` is an ``NDArray``, then the return value + and ``other`` will point to the same ``NDArray``. + """ if isinstance(other, NDArray): if other.handle is self.handle: warnings.warn('You are attempting to copy an array to itself', RuntimeWarning) return return _internal._copyto(self, out=other) elif isinstance(other, Context): - hret = SparseNDArray(_new_alloc_handle(self.storage_type, self.shape, other, True, self.dtype, self.aux_types())) + hret = SparseNDArray(_new_alloc_handle(self.storage_type, self.shape, other, + True, self.dtype, self.aux_types)) return _internal._copyto(self, out=hret) else: raise TypeError('copyto does not support type ' + str(type(other))) - def copy(self): - raise Exception('Not implemented for SparseND yet!') - - def as_in_context(self, context): - raise Exception('Not implemented for SparseND yet!') - def to_dense(self): return to_dense(self) + def _aux_data(self, i, writable=False): + """ Get a reference to the i-th aux data associated with the SparseNDArray. If the + SparseNDArray is not yet compacted, the returned result may include invalid values. -# TODO We need a to_dense method to test it -def csr(values, idx, indptr, shape, ctx=Context.default_ctx, dtype=mx_real_t, aux_types=None): - ''' constructor ''' - hdl = NDArrayHandle() - # TODO currently only supports NDArray input - assert (isinstance(values, NDArray)) - assert (isinstance(indptr, NDArray)) - assert (isinstance(idx, NDArray)) - assert (isinstance(shape, tuple)) - indices = c_array(NDArrayHandle, [indptr.handle, idx.handle]) - num_aux = mx_uint(2) - check_call(_LIB.MXNDArrayCreateSparse( - values.handle, num_aux, indices, - c_array(mx_uint, shape), - mx_uint(len(shape)), - ctypes.c_int(_STORAGE_TYPE_STR_TO_ID['csr']), - ctypes.c_int(ctx.device_typeid), - ctypes.c_int(ctx.device_id), - ctypes.c_int(int(False)), - ctypes.c_int(int(_DTYPE_NP_TO_MX[np.dtype(dtype).type])), - ctypes.byref(hdl))) - return SparseNDArray(hdl) + """ + hdl = NDArrayHandle() + check_call(_LIB.MXNDArrayGetAuxNDArray(self.handle, i, ctypes.byref(hdl))) + return NDArray(hdl, writable) + def _data(self, writable=False): + """ Get a reference to the data associated with the SparseNDArray. If the + SparseNDArray is not yet compacted, the returned result may include invalid values. 
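copyto and to_dense are the front-end entry points to the storage-aware copy path (CopyFromToImpl / cast_storage) introduced in ndarray.h above. A hedged sketch of both directions, using only calls that appear elsewhere in this patch:

    import mxnet as mx

    a = mx.sparse_nd.zeros('row_sparse', (2, 2))

    # storage conversion: sparse -> default storage
    dense = a.to_dense()          # same as mx.nd.cast_storage(a, storage_type='default')

    # copy to a context: a new SparseNDArray is allocated on the target device
    b = a.copyto(mx.cpu(0))

    # copy into an existing array of the same shape and storage type
    c = mx.sparse_nd.zeros('row_sparse', (2, 2))
    a.copyto(c)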
-# pylint: enable= no-member -# TODO(haibin) also specify aux_types -def row_sparse(values, index, shape, ctx=Context.default_ctx, dtype=mx_real_t, aux_types=None): - ''' rsp constructor which only accepts NDArray as input ''' - hdl = NDArrayHandle() - assert (isinstance(values, NDArray)) - assert (isinstance(index, NDArray)) - indices = c_array(NDArrayHandle, [index.handle]) - num_aux = mx_uint(1) - check_call(_LIB.MXNDArrayCreateSparse( - values.handle, num_aux, indices, - c_array(mx_uint, shape), - mx_uint(len(shape)), - ctypes.c_int(_STORAGE_TYPE_STR_TO_ID['row_sparse']), - ctypes.c_int(ctx.device_typeid), - ctypes.c_int(ctx.device_id), - ctypes.c_int(int(False)), - ctypes.c_int(int(_DTYPE_NP_TO_MX[np.dtype(dtype).type])), - ctypes.byref(hdl))) - return SparseNDArray(hdl) - - -def array(values, index_list, storage_type, shape, ctx=None, dtype=mx_real_t, aux_types=None): - ''' constructor ''' - # TODO check input array types. Assume NDArray class for now - # TODO support other types - # TODO also specify auxtypes - assert (storage_type == 'row_sparse' or storage_type == 'csr') - if aux_types is not None: - assert isinstance(aux_types, list) - assert len(aux_types) == len(index_list) - if not isinstance(values, NDArray): - values = ndarray.array(values) - for i, index in enumerate(index_list): - if not isinstance(index, NDArray): - index_list[i] = ndarray.array(index, dtype=aux_types[i] if aux_types is not None else None) - - if isinstance(shape, int): - shape = (shape,) + """ + hdl = NDArrayHandle() + check_call(_LIB.MXNDArrayGetDataNDArray(self.handle, ctypes.byref(hdl))) + return NDArray(hdl, writable) + + + def compact(self): + raise Exception("Not implemented yet") + +def _prepare_src_array(src, dtype, default_dtype): + if isinstance(src, NDArray): + dtype = src.dtype if dtype is None else dtype + else: + dtype = default_dtype if dtype is None else dtype + if not isinstance(src, np.ndarray): + try: + src = np.array(src, dtype=dtype) + except: + raise TypeError('values must be array like object') + return src, dtype + +def csr(values, indptr, indices, shape, ctx=None, dtype=None, indptr_type=None, indices_type=None): + """Creates a 2D array with compressed sparse row format. + + A SparseNDArray with `csr` storage represents a NDArray as three separate arrays: `values`, + `indptr` and `indices`. It uses the standard CSR representation where the column indices for + row i are stored in indices[indptr[i]:indptr[i+1]] and their corresponding values are stored + in values[indptr[i]:indptr[i+1]]. + + Parameters + ---------- + values: array_like + An object exposing the array interface, with shape [nnz], where D0 is the number of + non-zero entries. + indptr: array_like + An object exposing the array interface, with shape [D0 + 1]. The first element in indptr + should always be zero. + indices: array_like + An object exposing the array interface, with shape [nnz]. + ctx : Context, optional + Device context (default is the current default context). + dtype : str or numpy.dtype, optional + The data type of the output array. The default dtype is ``values.dtype`` + if `values` is an `NDArray`, `float32` otherwise. + indptr_type: str or numpy.dtype, optional + The data type of the indices array. The default dtype is ``indptr.dtype`` + if `indptr` is an `NDArray`, `int32` otherwise. + indices_type: str or numpy.dtype, optional + The data type of the indices array. The default dtype is ``indices.dtype`` + if `indicies` is an `NDArray`, `int32` otherwise. 
+ + Returns + ------- + SparseNDArray + An `SparseNDArray` with the `csr` storage representation. + """ + storage_type = 'csr' + # context if ctx is None: ctx = Context.default_ctx - if storage_type == 'row_sparse': - arr = row_sparse(values, index_list[0], shape, ctx=ctx, dtype=dtype, aux_types=aux_types) - elif storage_type == 'csr': - arr = csr(values, index_list[0], index_list[1], shape, ctx, dtype, aux_types) - else: - raise Exception('Not implemented for SparseND yet!') - return arr + # prepare src array and types + values, dtype = _prepare_src_array(values, dtype, mx_real_t) + indptr, indptr_type = _prepare_src_array(indptr, indptr_type, + _STORAGE_AUX_TYPES[storage_type][0]) + indices, indices_type = _prepare_src_array(indices, indices_type, + _STORAGE_AUX_TYPES[storage_type][1]) + # verify types + assert('int' in str(indptr_type) or 'long' in str(indptr_type)) + assert('int' in str(indices_type) or 'long' in str(indices_type)) + # verify shapes + aux_shapes = [indptr.shape, indices.shape] + assert(values.ndim == 1) + assert(indptr.ndim == 1) + assert(indices.ndim == 1) + assert(len(shape) == 2) + result = SparseNDArray(_new_alloc_handle(storage_type, shape, ctx, False, dtype, + [indptr_type, indices_type], aux_shapes)) + # assign indptr, indices and values + values_ref = result._data(True) + indptr_ref = result._aux_data(0, True) + indices_ref = result._aux_data(1, True) + values_ref[:] = values + indptr_ref[:] = indptr + indices_ref[:] = indices + return result + +def row_sparse(values, indices, shape, ctx=None, dtype=None, indices_type=None): + """Creates a row sparse array with a set of tensor slices at given indices. + + A SparseNDArray with `row_sparse` storage is typically used to represent a subset of a larger + NDArray with `default` storage of shape [LARGE0, D1, .. , DN] where LARGE0 >> D0. The values + in indices are the indices in the first dimension of the slices that have been extracted from + the larger NDArray. + + The corresponding NDArray ``dense`` with `default` storage represented by a ``rsp`` + SparseNDArray with `row_sparse` storage has + + ``dense[rsp.indices[i], :, :, :, ...] = rsp.values[i, :, :, :, ...]`` + + `row_sparse` SparseNDArray is used principally in the definition of gradients for operations + that have sparse gradients (e.g. SparseEmbedding). + Parameters + ---------- + values: array_like + An object exposing the array interface, with shape [D0, D1, .. Dn], where D0 is + the number of rows with non-zeros entries. + indices: array_like + An object exposing the array interface, with shape [D0]. + ctx : Context, optional + Device context (default is the current default context). + dtype : str or numpy.dtype, optional + The data type of the output array. The default dtype is ``values.dtype`` + if `values` is an `NDArray`, `float32` otherwise. + indices_type: str or numpy.dtype, optional + The data type of the indices array. The default dtype is ``indices.dtype`` + if `indicies` is an `NDArray`, `int32` otherwise. + + Returns + ------- + SparseNDArray + An `SparseNDArray` with the `row_sparse` storage representation. 
+ """ + storage_type = 'row_sparse' + # context + if ctx is None: + ctx = Context.default_ctx + # prepare src array and types + values, dtype = _prepare_src_array(values, dtype, mx_real_t) + indices, indices_type = _prepare_src_array(indices, indices_type, + _STORAGE_AUX_TYPES[storage_type][0]) + # verify types + assert('int' in str(indices_type) or 'long' in str(indices_type)) + # verify shapes + assert(values.ndim == len(shape)) + assert(indices.ndim == 1) + result = SparseNDArray(_new_alloc_handle(storage_type, shape, ctx, False, dtype, + [indices_type], [indices.shape])) + # assign indices and values + values_ref = result._data(True) + indices_ref = result._aux_data(0, True) + values_ref[:] = values + indices_ref[:] = indices + return result def to_dense(source): - return ndarray.cast_storage(source, storage_type='default') + """ Return a dense array representation of this SparseNDArray. + Returns + ------- + SparseNDArray + The dense array with default storage + """ + return ndarray.cast_storage(source, storage_type='default') -def zeros(shape, storage_type, ctx=None, dtype=mx_real_t, aux_types=None): +def zeros(storage_type, shape, ctx=None, dtype=None, aux_types=None): """Return a new array of given shape and type, filled with zeros. Parameters ---------- shape : int or tuple of int The shape of the empty array - storage_type: - 'row_sparse', etc + storage_type: string + The storage type of the empty array, such as 'row_sparse', 'csr', etc ctx : Context, optional An optional device context (default is the current default context) dtype : str or numpy.dtype, optional An optional value type (default is `float32`) - aux_types: - [np.int32], etc + aux_types: list of numpy.dtype, optional + An optional type for the aux data for SparseNDArray (default values depends + on the storage type) Returns ------- - NDArray + SparseNDArray A created array - - Examples - -------- - >>> mx.nd.zeros(1).asnumpy() - array([ 0.], dtype=float32) - >>> mx.nd.zeros((1,2), mx.gpu(0)) - - >>> mx.nd.zeros((1,2), mx.gpu(0), 'float16').asnumpy() - array([[ 0., 0.]], dtype=float16) """ if ctx is None: ctx = Context.default_ctx - assert (storage_type == 'row_sparse' or storage_type == 'csr') + + dtype = mx_real_t if dtype is None else dtype if aux_types is None: - if 'row_sparse' == storage_type: - aux_types = _STORAGE_AUX_TYPES['row_sparse'] - elif 'csr' == storage_type: - aux_types = _STORAGE_AUX_TYPES['csr'] - # pylint: disable= no-member, protected-access - out = SparseNDArray(_new_alloc_handle(storage_type, shape, ctx, - aux_types=aux_types)) + if storage_type == 'row_sparse' or storage_type == 'csr': + aux_types = _STORAGE_AUX_TYPES[storage_type] + else: + raise Exception("unknown storage type") + assert(len(aux_types) == len(_STORAGE_AUX_TYPES[storage_type])) + out = SparseNDArray(_new_alloc_handle(storage_type, shape, ctx, True, dtype, aux_types)) return _internal._zeros(shape=shape, ctx=ctx, dtype=dtype, out=out) - # pylint: enable= no-member, protected-access - _STORAGE_TYPE_TO_ND_CLASS = { _STORAGE_TYPE_STR_TO_ID['default']: ndarray.NDArray, _STORAGE_TYPE_STR_TO_ID['row_sparse']: SparseNDArray, _STORAGE_TYPE_STR_TO_ID['csr']: SparseNDArray, } + _init_ndarray_module(_STORAGE_TYPE_TO_ND_CLASS, "mxnet") diff --git a/python/mxnet/symbol.py b/python/mxnet/symbol.py index 4beca64c487c..6d473be9cde2 100644 --- a/python/mxnet/symbol.py +++ b/python/mxnet/symbol.py @@ -1123,11 +1123,8 @@ def simple_bind(self, ctx, else attrs[k]['__storage_type__'] for k in self.list_arguments()} arg_shapes, _, aux_shapes = 
self.infer_shape(**kwargs) arg_types, _, aux_types = self.infer_type(**type_dict) - # print(storage_type_dict) arg_storage_types, _, _ = \ self.infer_storage_type(**storage_type_dict) - # print("arg_storage_types", arg_storage_types) - # print("out_storage_types", out_storage_types) if arg_shapes is None or arg_types is None: raise ValueError("Input node is not complete") @@ -1147,17 +1144,21 @@ def simple_bind(self, ctx, # alloc space arg_ndarrays = [ # avoid allocating dense ndarrays for sparse inputs - _nd_zeros(shape, dev, dtype=dtype) if storage_type != 'row_sparse' - else _sparse_nd_zeros(shape, storage_type, dev, dtype=dtype) + _nd_zeros(shape, dev, dtype=dtype) if storage_type == 'default' + else _sparse_nd_zeros(storage_type, shape, dev, dtype=dtype) for dtype, dev, shape, storage_type in \ zip(arg_types, arg_ctx, arg_shapes, arg_storage_types)] - # print(arg_ndarrays) if grad_req != 'null': grad_ndarrays = {} for name, shape, dev, dtype in zip( self.list_arguments(), arg_shapes, arg_ctx, arg_types): if not isinstance(grad_req, dict) or grad_req[name] != 'null': - grad_ndarrays[name] = _nd_zeros(shape, dev, dtype=dtype) + # TODO(haibin) temporarily set gradient stype for embedding op + if name != 'embed_weight': + grad_ndarrays[name] = _nd_zeros(shape, dev, dtype=dtype) + else: + grad_ndarrays[name] = _sparse_nd_zeros('row_sparse', shape, + dev, dtype=dtype) else: grad_ndarrays = None diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py index fa799313ed6f..60fa7c1792ca 100644 --- a/python/mxnet/test_utils.py +++ b/python/mxnet/test_utils.py @@ -9,8 +9,10 @@ import os import errno import logging +import scipy as sp import numpy as np import numpy.testing as npt +import numpy.random as rnd import mxnet as mx from .context import cpu, gpu, Context from .ndarray import array @@ -65,29 +67,37 @@ def random_arrays(*shapes): return arrays # TODO(haibin) also include types in arguments -def random_sparse_ndarray(shape, storage_type, sparsity = 0.5, allow_zeros = False): - """Generate a random sparse ndarray.""" +def rand_sparse_ndarray(shape, storage_type, density=None): + """Generate a random sparse ndarray. 
Returns the ndarray, value(np) and indices(np) """ + density = rnd.rand() if density is None else density if storage_type == 'row_sparse': # TODO(haibin) support high dim sparse ndarray assert(len(shape) < 3) prod = np.prod(shape) - # sample idx - idx_sample = np.random.rand(shape[0]) - idx = np.argwhere(idx_sample > sparsity).flatten() - if idx.shape[0] == 0: - if allow_zeros: - return mx.sparse_nd.zeros(shape, 'row_sparse') - idx = np.array([0]) - # generate random values - num_rows = idx.shape[0] num_cols = long(prod / shape[0]) - value = np.random.rand(num_rows, num_cols) - indices = [idx] - arr = mx.sparse_nd.array(value, indices, storage_type, shape, aux_types=[np.int32]) + # sample index + idx_sample = rnd.rand(shape[0]) + indices = np.argwhere(idx_sample < density).flatten() + if indices.shape[0] == 0: + return mx.sparse_nd.zeros('row_sparse', shape), (np.array([]), np.array([], dtype='int32')) + # generate random values + val = rnd.rand(indices.shape[0], num_cols) + arr = mx.sparse_nd.row_sparse(val, indices, shape, indices_type=np.int32) + return arr, (val, indices) + elif storage_type == 'csr': + assert(len(shape) == 2) + csr = sp.sparse.rand(shape[0], shape[1], density=density, format='csr') + result = mx.sparse_nd.csr(csr.data, csr.indptr, csr.indices, shape) + return result, (csr.indptr, csr.indices, csr.data) else: - raise Exception('Not implemented for SparseND yet!') - return arr + assert(False), "unknown storage type" +def rand_ndarray(shape, storage_type, density=None): + if storage_type == 'default': + arr = mx.nd.array(random_arrays(shape)) + else: + arr, _ = rand_sparse_ndarray(shape, storage_type, density=density) + return arr def np_reduce(dat, axis, keepdims, numpy_reduce_func): """Compatible reduce for old version of NumPy. @@ -281,7 +291,7 @@ def _parse_location(sym, location, ctx): Returns ------- - dict of str to np.ndarray + dict of str to NDArray """ assert isinstance(location, (dict, list, tuple)) if isinstance(location, dict): @@ -592,14 +602,28 @@ def check_symbolic_backward(sym, location, out_grads, expected, rtol=1e-5, atol= if isinstance(expected, (list, tuple)): expected = {k:v for k, v in zip(sym.list_arguments(), expected)} args_grad_npy = {k:_rng.normal(size=v.shape) for k, v in expected.items()} - args_grad_data = {k: mx.nd.array(v, ctx=ctx) for k, v in args_grad_npy.items()} + # args_grad_data should be casted to storage type if hinted + # TODO(haibin) this is a temporary solution for testing. 
remove later + attrs = sym.attr_dict() + args_grad_data = {} + for k, v in args_grad_npy.items(): + grad_stype = attrs[k].get('grad_stype_hint', None) + nd = mx.nd.array(v, ctx=ctx) + if grad_stype is not None: + out = mx.nd.cast_storage(nd, storage_type=grad_stype) + args_grad_data[k] = out + else: + args_grad_data[k] = nd + if isinstance(grad_req, str): grad_req = {k:grad_req for k in sym.list_arguments()} elif isinstance(grad_req, (list, tuple)): grad_req = {k:v for k, v in zip(sym.list_arguments(), grad_req)} - executor = sym.bind(ctx=ctx, args=location, args_grad=args_grad_data, aux_states=aux_states) + executor = sym.bind(ctx=ctx, args=location, args_grad=args_grad_data, + aux_states=aux_states, grad_req=grad_req) executor.forward(is_train=True) + if isinstance(out_grads, (tuple, list)): out_grads = [mx.nd.array(v, ctx=ctx) for v in out_grads] elif isinstance(out_grads, (dict)): diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index efef3ed2b972..80b6d3688c02 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -131,31 +131,6 @@ int MXNDArrayCreate(const mx_uint *shape, API_END(); } -// TODO(haibin) remove this API -int MXNDArrayCreateSparse(NDArrayHandle data, - mx_uint num_aux, - NDArrayHandle *aux_vec, - const mx_uint *shape, - mx_uint ndim, - int storage_type, - int dev_type, - int dev_id, - int delay_alloc, - int dtype, - NDArrayHandle *out) { - API_BEGIN(); - auto ctx = Context::Create(static_cast(dev_type), dev_id); - std::vector aux_ndarrays; - NDArray* data_ptr = reinterpret_cast(data); - for (size_t i = 0; i < num_aux; i++) { - NDArray* nd_aux_ptr = reinterpret_cast(aux_vec[i]); - aux_ndarrays.push_back(*nd_aux_ptr); - } - NDArrayStorageType stype = (NDArrayStorageType) storage_type; - *out = new NDArray(*data_ptr, aux_ndarrays, ctx, stype, TShape(shape, shape + ndim)); - API_END(); -} - int MXNDArrayCreateEx(const mx_uint *shape, mx_uint ndim, int dev_type, @@ -181,16 +156,26 @@ int MXNDArrayCreateSparseEx(int storage_type, int dtype, mx_uint num_aux, int *aux_type, + mx_uint *aux_ndims, + const mx_uint *aux_shape, NDArrayHandle *out) { API_BEGIN(); std::vector aux_types; - for (size_t i = 0; i < num_aux; i++) aux_types.push_back(aux_type[i]); + std::vector aux_shapes; + auto shape_start = aux_shape; + for (size_t i = 0; i < num_aux; i++) { + // types + aux_types.push_back(aux_type[i]); + // shapes + aux_shapes.emplace_back(shape_start, shape_start + aux_ndims[i]); + shape_start += aux_ndims[i]; + } *out = new NDArray( NDArrayStorageType(storage_type), TShape(shape, shape + ndim), Context::Create(static_cast(dev_type), dev_id), delay_alloc != 0, - dtype, aux_types); + dtype, aux_types, aux_shapes); API_END(); } @@ -409,17 +394,27 @@ int MXNDArrayGetDType(NDArrayHandle handle, int MXNDArrayGetAuxType(NDArrayHandle handle, mx_uint i, - int *out_aux_type) { + int *out_type) { + API_BEGIN(); + NDArray *arr = static_cast(handle); + *out_type = arr->aux_type(i); + API_END(); +} + +int MXNDArrayGetAuxNDArray(NDArrayHandle handle, + mx_uint i, + NDArrayHandle *out) { API_BEGIN(); NDArray *arr = static_cast(handle); - *out_aux_type = arr->aux_type(i); + *out = new NDArray(arr->AuxNDArray(i)); API_END(); } -int MXNDArrayGetNumAux(NDArrayHandle handle, mx_uint *out_num_aux) { +int MXNDArrayGetDataNDArray(NDArrayHandle handle, + NDArrayHandle *out) { API_BEGIN(); NDArray *arr = static_cast(handle); - *out_num_aux = arr->num_aux(); + *out = new NDArray(arr->DataNDArray()); API_END(); } diff --git a/src/c_api/c_api_ndarray.cc b/src/c_api/c_api_ndarray.cc index 
784d59ed8638..69266cefabee 100644 --- a/src/c_api/c_api_ndarray.cc +++ b/src/c_api/c_api_ndarray.cc @@ -249,8 +249,8 @@ void SetDependency(std::vector *p_read_vars, } CHECK_LE(ntmp, 1) << "Only support 1 temp space request"; } - common::PrepVars(ndinputs, &read_vars); - common::PrepVars(ndoutputs, &write_vars); + for (auto& i : ndinputs) read_vars.emplace_back(i.var()); + for (auto& i : ndoutputs) write_vars.emplace_back(i.var()); if (mutate.count(op)) { auxidx = mutate[op](attrs); std::sort(auxidx.begin(), auxidx.end()); @@ -278,21 +278,21 @@ void PushFCompute(const FCompute& fn, RunContext rctx, engine::CallbackOnComplete on_complete) { std::vector input_blobs, output_blobs; - std::vector tmp_nds; + std::vector tmps; OpContext opctx{is_train, rctx, engine::CallbackOnComplete(), requested}; if (ctx.dev_mask() == gpu::kDevMask) { #if MXNET_USE_CUDA - common::PrepDefaultBlobs(ndinputs, ndoutputs, &input_blobs, - &output_blobs, &tmp_nds, true, opctx); + common::GetInputBlobs(ndinputs, &input_blobs, &tmps, opctx); + common::GetOutputBlobs(ndoutputs, &output_blobs, true); #else LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; #endif } else { - common::PrepDefaultBlobs(ndinputs, ndoutputs, &input_blobs, - &output_blobs, &tmp_nds, true, opctx); + common::GetInputBlobs(ndinputs, &input_blobs, &tmps, opctx); + common::GetOutputBlobs(ndoutputs, &output_blobs, true); } std::vector req(output_blobs.size(), kWriteTo); fn(attrs, opctx, input_blobs, req, output_blobs); diff --git a/src/common/utils.h b/src/common/utils.h index d99c097a84c8..c46a27862054 100644 --- a/src/common/utils.h +++ b/src/common/utils.h @@ -37,33 +37,29 @@ namespace common { #if DMLC_USE_CXX11 template -inline void PrepDefaultBlobs(const std::vector& ndinputs, - const std::vector& ndoutputs, - std::vector *input_blobs, - std::vector *output_blobs, - std::vector *tmp_nds, - bool alloc_outputs, - const OpContext& ctx) { - for (auto& i : ndinputs) { - if (i.storage_type() != kDefaultStorage) { - NDArray tmp_nd(i.shape(), i.ctx(), false); - op::CastStorageComputeEx({}, ctx, {i}, {}, {tmp_nd}); - tmp_nds->push_back(tmp_nd); - input_blobs->push_back(tmp_nd.data()); +inline void GetInputBlobs(const std::vector& nds, + std::vector *blobs, + std::vector *temps, + const OpContext& ctx) { + for (auto& nd : nds) { + if (nd.storage_type() != kDefaultStorage) { + NDArray temp(nd.shape(), nd.ctx(), false); + op::CastStorageComputeEx({}, ctx, {nd}, {}, {temp}); + temps->push_back(temp); + blobs->push_back(temp.data()); } else { - input_blobs->push_back(i.data()); + blobs->push_back(nd.data()); } } - for (auto& i : ndoutputs) { - if (alloc_outputs) i.CheckAndAlloc(); - output_blobs->push_back(i.data()); - } } -inline void PrepVars(const std::vector &nds, - std::vector *vars) { - for (auto& i : nds) { - vars->push_back(i.var()); +template +inline void GetOutputBlobs(const std::vector& nds, + std::vector *blobs, + bool alloc) { + for (auto& nd : nds) { + if (alloc) nd.CheckAndAlloc(); + blobs->push_back(nd.data()); } } diff --git a/src/executor/attach_op_execs_pass.cc b/src/executor/attach_op_execs_pass.cc index 70db4d16fee0..f690c9b03e4c 100644 --- a/src/executor/attach_op_execs_pass.cc +++ b/src/executor/attach_op_execs_pass.cc @@ -8,14 +8,15 @@ #include #include #include "./exec_pass.h" +#include "../common/utils.h" #if MXNET_USE_MKL2017 == 1 #include #include "../operator/mkl/mkl_memory-inl.h" #include "../operator/mkl/mkl_util-inl.h" #endif -#include "../common/utils.h" -#define EXEC_DISPATCH_DEBUG 0 +#define EXEC_ATTACH_OP_DEBUG 0 + namespace mxnet 
{ namespace op { @@ -29,6 +30,26 @@ class ForwardOpExecutor : public OpExecutor { public: void Run(RunContext rctx, bool is_gpu) override { op_ctx.run_ctx = rctx; + + // TODO(haibin) ForwardOp is stateful. If any input ndarray has non-default storage, + // we need to cast it to default storage and setup the tblobs again. For example, + // if any of the input ndarray chagnes, the updated value won't be reflected in the temporary + // ndarray with default storage. This is not efficient and should be improved later. + in_data_.clear(); out_data_.clear(); aux_data_.clear(); tmps_.clear(); + if (is_gpu) { +#if MXNET_USE_CUDA + common::GetInputBlobs(in_array_, &in_data_, &tmps_, op_ctx); + common::GetInputBlobs(aux_array_, &aux_data_, &tmps_, op_ctx); + common::GetOutputBlobs(out_array, &out_data_, true); +#else + LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; +#endif + } else { + common::GetInputBlobs(in_array_, &in_data_, &tmps_, op_ctx); + common::GetInputBlobs(aux_array_, &aux_data_, &tmps_, op_ctx); + common::GetOutputBlobs(out_array, &out_data_, true); + } + op_->Forward(op_ctx, in_data_, req, out_data_, aux_data_); #if MKL_EXPERIMENTAL == 1 mkl_tblobs_prv_to_cpu(in_data_); @@ -38,18 +59,14 @@ class ForwardOpExecutor : public OpExecutor { } void Setup() override { - in_data_.clear(); aux_data_.clear(); + // We need to tell whether in NDArray is input or aux for (size_t i = 0; i < in_array.size(); ++i) { if (!std::binary_search(aux_index_.begin(), aux_index_.end(), i)) { - in_data_.push_back(in_array[i].data()); + in_array_.emplace_back(in_array[i]); } else { - aux_data_.push_back(in_array[i].data()); + aux_array_.emplace_back(in_array[i]); } } - out_data_.resize(out_array.size()); - std::transform(out_array.begin(), out_array.end(), out_data_.begin(), [](const NDArray& nd) { - return nd.data(); - }); } Operator::ExecType exec_type() const override { return op_->exec_type(); @@ -65,6 +82,7 @@ class ForwardOpExecutor : public OpExecutor { std::shared_ptr op_; std::vector aux_index_; std::vector in_data_, out_data_, aux_data_; + std::vector in_array_, aux_array_, tmps_; }; // backward executor @@ -140,19 +158,22 @@ class FComputeExecutor : public OpExecutor { public: void Run(RunContext rctx, bool is_gpu) override { op_ctx.run_ctx = rctx; - if (!initialized) { + // setup blobs + // TODO(haibin) we should avoid repeating this if it's known that all inputs are in + // default-storage. 
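// ---------------------------------------------------------------------------
// A minimal standalone sketch (not part of this patch) of the blob-gathering
// fallback that the new common::GetInputBlobs / common::GetOutputBlobs helpers
// implement: any input whose storage is not the default (dense) type is first
// cast into a dense temporary, and the dense kernel then runs on plain blobs.
// The Array/Blob types below are simplified stand-ins, not the MXNet
// NDArray/TBlob API; the real cast is CastStorageComputeEx, whose kernels
// appear later in this patch.
#include <cstddef>
#include <vector>

struct Blob { float* dptr; size_t size; };

struct Array {
  bool dense;                 // true for default (dense) storage
  std::vector<float> values;  // dense buffer, or compacted sparse values
  Blob data() { return Blob{values.data(), values.size()}; }
};

// Cast a (conceptually) sparse array into a dense temporary.
Array CastToDense(const Array& src, size_t dense_size) {
  (void)src;  // a real cast would scatter src's stored rows into tmp
  Array tmp{true, std::vector<float>(dense_size, 0.0f)};
  return tmp;
}

void GetInputBlobsSketch(std::vector<Array>& inputs,
                         std::vector<Blob>* blobs,
                         std::vector<Array>* temps,
                         size_t dense_size) {
  for (auto& nd : inputs) {
    if (!nd.dense) {
      temps->push_back(CastToDense(nd, dense_size));  // keep temporary alive
      blobs->push_back(temps->back().data());
    } else {
      blobs->push_back(nd.data());
    }
  }
}
// ---------------------------------------------------------------------------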
+ { + in_data_.clear(); out_data_.clear(); if (is_gpu) { #if MXNET_USE_CUDA - common::PrepDefaultBlobs(in_array, out_array, &in_data_, - &out_data_, &tmp_nds_, true, op_ctx); + common::GetInputBlobs(in_array, &in_data_, &tmp_nds_, op_ctx); + common::GetOutputBlobs(out_array, &out_data_, true); #else LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; #endif } else { - common::PrepDefaultBlobs(in_array, out_array, &in_data_, - &out_data_, &tmp_nds_, true, op_ctx); + common::GetInputBlobs(in_array, &in_data_, &tmp_nds_, op_ctx); + common::GetOutputBlobs(out_array, &out_data_, true); } - initialized = true; } fcompute_(attrs_, op_ctx, in_data_, req, out_data_); #if MKL_EXPERIMENTAL == 1 @@ -160,10 +181,7 @@ class FComputeExecutor : public OpExecutor { mkl_tblobs_prv_to_cpu(out_data_); #endif } - void Setup() override { - in_array_ = in_array; - out_array_ = out_array; - } + void Setup() override {} Operator::ExecType exec_type() const override { return Operator::kSync; } @@ -175,8 +193,7 @@ class FComputeExecutor : public OpExecutor { FCompute fcompute_; NodeAttrs attrs_; std::vector in_data_, out_data_; - std::vector in_array_, out_array_, tmp_nds_; - bool initialized = false; + std::vector tmp_nds_; }; // fcomputend executor @@ -236,7 +253,7 @@ Graph AttachOpExecs(Graph g) { FCompute fcompute = common::GetFCompute(inode.source->op(), vctx[i]); FComputeEx fcompute_ex = common::GetFComputeEx(inode.source->op(), vctx[i], dispatch_stypes[i]); -#if EXEC_DISPATCH_DEBUG +#if EXEC_ATTACH_OP_DEBUG LOG(INFO) << "dispatch type = " << dispatch_stypes[i]; #endif if (fcreate_layer_op.count(inode.source->op())) { @@ -254,22 +271,30 @@ Graph AttachOpExecs(Graph g) { inode.source->attrs, vctx[i], ishape, itype)); } ret[i] = std::make_shared(opr, mutate_index); +#if EXEC_ATTACH_OP_DEBUG + LOG(INFO) << "ForwardOp for op " << inode.source->op()->name; +#endif } else if (is_layer_backward.get(inode.source->op(), false)) { CHECK_GE(inode.control_deps.size(), 1); uint32_t fwd_id = inode.control_deps[0]; CHECK(vctx[fwd_id] == vctx[i]); CHECK(ret[fwd_id] != nullptr); + CHECK_EQ(dispatch_stypes[i], kDefaultStorage) + << "BackwardOp doesn't handle non-default storage yet"; ret[i] = std::make_shared( dynamic_cast(ret[fwd_id].get())->op_, mxnet::op::OpPropGetOpProperty(inode.source->attrs), mutate_index); +#if EXEC_ATTACH_OP_DEBUG + LOG(INFO) << "BackwardOp for op " << inode.source->op()->name; +#endif } else if (fcompute_ex != nullptr) { -#if EXEC_DISPATCH_DEBUG +#if EXEC_ATTACH_OP_DEBUG LOG(INFO) << "FComputeEx for op " << inode.source->op()->name; #endif ret[i] = std::make_shared(fcompute_ex, inode.source->attrs); } else if (fcompute != nullptr) { -#if EXEC_DISPATCH_DEBUG +#if EXEC_ATTACH_OP_DEBUG LOG(INFO) << "FCompute for op " << inode.source->op()->name; #endif ret[i] = std::make_shared(fcompute, inode.source->attrs); diff --git a/src/executor/exec_pass.h b/src/executor/exec_pass.h index b23f7fa47fc9..20535be320d9 100644 --- a/src/executor/exec_pass.h +++ b/src/executor/exec_pass.h @@ -32,7 +32,7 @@ const int kNonDefaultStorage = -2; */ class OpExecutor { public: - /*! \brief input arrays */ + /*! \brief input data arrays, which may be either input or aux */ std::vector in_array; /*! 
\brief output data arrays */ std::vector out_array; diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc index 5e63a04f7140..e8fd1ed390da 100644 --- a/src/executor/graph_executor.cc +++ b/src/executor/graph_executor.cc @@ -403,29 +403,34 @@ Graph GraphExecutor::InitGraph(nnvm::Symbol symbol, size_t arg_top = 0, aux_top = 0; for (size_t i = 0; i < num_forward_inputs_; ++i) { const uint32_t nid = idx.input_nodes().at(i); + size_t eid = idx.entry_id(nid, 0); if (mutable_nodes.count(nid)) { CHECK_LT(aux_top, aux_states.size()); - data_entry_[idx.entry_id(nid, 0)] = aux_states[aux_top]; + data_entry_[eid] = aux_states[aux_top]; arg_shapes.push_back(aux_states[aux_top].shape()); arg_types.push_back(aux_states[aux_top].dtype()); arg_storage_types.push_back(aux_states[aux_top].storage_type()); ++aux_top; } else { CHECK_LT(arg_top, in_args.size()); - data_entry_[idx.entry_id(nid, 0)] = in_args[arg_top]; + data_entry_[eid] = in_args[arg_top]; arg_shapes.push_back(in_args[arg_top].shape()); arg_types.push_back(in_args[arg_top].dtype()); arg_storage_types.push_back(in_args[arg_top].storage_type()); ++arg_top; } - // LOG(INFO) << "update data_entry_[ " << idx.entry_id(nid, 0) << "]" - // << " " << data_entry_[idx.entry_id(nid, 0)].storage_type(); +#if EXECUTOR_DEBUG + LOG(INFO) << "assign data entry " << eid << "\tas stype " + << data_entry_[eid].storage_type() << " (input)"; +#endif } for (size_t j = num_forward_outputs_; j < idx.outputs().size(); ++j) { - data_entry_[idx.entry_id(idx.outputs()[j])] - = grad_store_[j - num_forward_outputs_].second; - // LOG(INFO) << "update data_entry_[ " << idx.entry_id(idx.outputs()[j]) << "]" - // << " " << data_entry_[idx.entry_id(idx.outputs()[j])].storage_type() << "(output)"; + auto eid = idx.entry_id(idx.outputs()[j]); + data_entry_[eid] = grad_store_[j - num_forward_outputs_].second; +#if EXECUTOR_DEBUG + LOG(INFO) << "assign data entry " << eid << "\tas stype " + << data_entry_[eid].storage_type() << " (output)"; +#endif } arg_shapes.resize(idx.input_nodes().size(), TShape()); arg_types.resize(idx.input_nodes().size(), -1); @@ -520,17 +525,19 @@ void GraphExecutor::InitDataEntryMemory(std::vector* shared_pool) { uint32_t nid = idx.input_nodes().at(i); uint32_t oid = head_grad_map_.at(idx[nid].source); uint32_t eid = idx.entry_id(idx.outputs()[oid]); - NDArrayStorageType storage_type = (NDArrayStorageType) vstorage_type[eid]; + NDArrayStorageType stype = (NDArrayStorageType) vstorage_type[eid]; CHECK_NE(vshape[eid].ndim(), 0U); CHECK_NE(vdtype[eid], -1); - // init NDArray based on storage_type - if (storage_type != kDefaultStorage) { - data_entry_[idx.entry_id(nid, 0)] = - NDArray(storage_type, vshape[eid], data_context[eid], true, vdtype[eid]); + auto data_eid = idx.entry_id(nid, 0); + // initialize based on storage_type + if (stype != kDefaultStorage) { + data_entry_[data_eid] = NDArray(stype, vshape[eid], data_context[eid], true, vdtype[eid]); } else { - data_entry_[idx.entry_id(nid, 0)] = - NDArray(vshape[eid], data_context[eid], false, vdtype[eid]); + data_entry_[data_eid] = NDArray(vshape[eid], data_context[eid], false, vdtype[eid]); } +#if EXECUTOR_DEBUG + LOG(INFO) << "init data entry " << data_eid << "\tas stype " << stype << "(head_grad)"; +#endif } // get maximum bytes in each pool for (size_t i = 0; i < vshape.size(); ++i) { @@ -546,7 +553,6 @@ void GraphExecutor::InitDataEntryMemory(std::vector* shared_pool) { if (info.bytes == 0) { info = PoolEntry{data_context[i], bytes, data_storage_type[i]}; } else { - // std::cout << 
"WARNING Updated info.bytes" << std::endl; info.bytes = std::max(info.bytes, bytes); } } @@ -600,7 +606,6 @@ void GraphExecutor::InitDataEntryMemory(std::vector* shared_pool) { } } // assign the data entries - for (size_t i = 0; i < data_entry_.size(); ++i) { // avoid pre-allocated arrays if (!data_entry_[i].is_none()) continue; @@ -612,8 +617,11 @@ void GraphExecutor::InitDataEntryMemory(std::vector* shared_pool) { const NDArray& src = data_pool_.at(storage_id); data_entry_[i] = src.AsArray(vshape[i], vdtype[i]); } else { - data_entry_[i] = NDArray(storage_type, vshape[i], vctx[i]); + data_entry_[i] = NDArray(storage_type, vshape[i], data_context[i]); } +#if EXECUTOR_DEBUG + LOG(INFO) << "init data entry " << i << " as stype " << storage_type; +#endif } } @@ -871,7 +879,7 @@ void GraphExecutor::RunOps(bool is_train, size_t topo_start, size_t topo_end) { bool profiling = false; #endif #if EXECUTOR_DEBUG - LOG(INFO) << "Running node " << nid << " - " << seg_op.topo_end - 1; + LOG(INFO) << "Run node " << nid << " - " << seg_op.topo_end - 1; #endif Engine::Get()->Push(seg_op.opr, seg_op.ctx, 0, profiling); nid = seg_op.topo_end - 1; @@ -884,7 +892,7 @@ void GraphExecutor::RunOps(bool is_train, size_t topo_start, size_t topo_end) { if (op_nodes_[nid].skip_exec_node) continue; opnode.exec->op_ctx.is_train = is_train; #if EXECUTOR_DEBUG - LOG(INFO) << "Running node " << nid; + LOG(INFO) << "Run node " << nid; #endif if (opnode.exec->exec_type() == Operator::kCrossDeviceCopy) { CHECK_EQ(inode.inputs.size(), 1U); diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 94616855459c..e9f3195feed0 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -11,6 +11,7 @@ #include #include #include "./ndarray_function.h" +#include "../operator/tensor/init_op.h" #if MXNET_USE_OPENCV #include @@ -232,12 +233,12 @@ void ScalarOp(const NDArray &lhs, } } + void CopyFromTo(const NDArray &from, NDArray *to, int priority, bool alloc_output) { if (from.var() == to->var()) { // skip to copy to itself return; } - CHECK_EQ(from.storage_type(), to->storage_type()) << "Copying with different storage type"; CHECK(from.shape() == to->shape()) << "operands shape mismatch" << "from.shape = " << from.shape() << " to.shape=" << to->shape(); @@ -253,60 +254,28 @@ void CopyFromTo(const NDArray &from, NDArray *to, int priority, bool alloc_outpu if (a == cpu::kDevMask && b == cpu::kDevMask) { Engine::Get()->PushSync([from, ret, alloc](RunContext ctx) { - auto storage_type = from.storage_type(); - if (storage_type == kDefaultStorage) { - if (alloc) ret.CheckAndAlloc(); - TBlob tmp = ret.data(); - ndarray::Copy(from.data(), &tmp, - from.ctx(), ret.ctx(), ctx); - } else if (storage_type == kRowSparseStorage) { - auto aux_shape = from.aux_shape(0); - if (aux_shape.Size() == 0) return; - if (alloc) ret.CheckAndAlloc({aux_shape}); - TBlob val = ret.data(); - TBlob idx = ret.aux_data(rowsparse::kIdx); - ndarray::Copy(from.data(), &val, - from.ctx(), ret.ctx(), ctx); - ndarray::Copy(from.aux_data(rowsparse::kIdx), &idx, - from.ctx(), ret.ctx(), ctx); - } else { - LOG(FATAL) << "Not implemented yet"; - } + NDArray nd(ret); + CopyFromToImpl(from, &nd, ctx, alloc); }, from.ctx(), const_vars, {ret.var()}, FnProperty::kNormal, priority, PROFILER_MESSAGE("CopyCPU2CPU")); } else { #if MXNET_USE_CUDA if (a == cpu::kDevMask && b == gpu::kDevMask) { Engine::Get()->PushSync([from, ret, alloc](RunContext ctx) { - if (from.storage_type() != kDefaultStorage) LOG(FATAL) << "GPU not implemented yet"; - if (alloc) 
ret.CheckAndAlloc(); - TBlob tmp = ret.data(); - ndarray::Copy(from.data(), &tmp, - from.ctx(), ret.ctx(), ctx); - // Wait GPU kernel to complete - ctx.get_stream()->Wait(); + NDArray nd(ret); + CopyFromToImpl(from, &nd, ctx, alloc); }, ret.ctx(), const_vars, {ret.var()}, FnProperty::kCopyToGPU, priority, PROFILER_MESSAGE("CopyCPU2GPU")); } else if (a == gpu::kDevMask && b == cpu::kDevMask) { Engine::Get()->PushSync([from, ret, alloc](RunContext ctx) { - if (from.storage_type() != kDefaultStorage) LOG(FATAL) << "GPU not implemented yet"; - if (alloc) ret.CheckAndAlloc(); - TBlob tmp = ret.data(); - ndarray::Copy(from.data(), &tmp, - from.ctx(), ret.ctx(), ctx); - // Wait GPU kernel to complete - ctx.get_stream()->Wait(); + NDArray nd(ret); + CopyFromToImpl(from, &nd, ctx, alloc); }, from.ctx(), const_vars, {ret.var()}, FnProperty::kCopyFromGPU, priority, PROFILER_MESSAGE("CopyGPU2CPU")); } else if (a == gpu::kDevMask && b == gpu::kDevMask) { Engine::Get()->PushSync([from, ret, alloc](RunContext ctx) { - if (from.storage_type() != kDefaultStorage) LOG(FATAL) << "GPU not implemented yet"; - if (alloc) ret.CheckAndAlloc(); - TBlob tmp = ret.data(); - ndarray::Copy(from.data(), &tmp, - from.ctx(), ret.ctx(), ctx); - // Wait GPU kernel to complete - ctx.get_stream()->Wait(); + NDArray nd(ret); + CopyFromToImpl(from, &nd, ctx, alloc); }, from.ctx(), const_vars, {ret.var()}, from.dtype() != ret.dtype() ? FnProperty::kNormal : FnProperty::kCopyFromGPU, priority, PROFILER_MESSAGE("CopyGPU2GPU")); diff --git a/src/operator/elemwise_op_common.h b/src/operator/elemwise_op_common.h index c8a7b0c034e5..f4315b62a6a8 100644 --- a/src/operator/elemwise_op_common.h +++ b/src/operator/elemwise_op_common.h @@ -61,12 +61,10 @@ template *in_attrs, std::vector *out_attrs) { - // LOG(INFO) << "ElemwiseStorageAttr for " << attrs.name; auto deduce = [&](std::vector *vec, const char *name, AttrType& result, bool fallback) { auto &v = *vec; for (size_t i = 0; i < vec->size(); ++i) { - // LOG(INFO) << "deduce " << (*vec)[i]; if (v[i] == kUndefinedStorage) { // if input type is unknown, assume it's default storage CHECK(assign(&v[i], kDefaultStorage)); @@ -123,12 +121,16 @@ inline bool ElemwiseStorageType(const nnvm::NodeAttrs& attrs, } inline bool IdentityAttrLikeRhsStorageType(const nnvm::NodeAttrs& attrs, - std::vector *in_attrs, - std::vector *out_attrs) { + std::vector *in_attrs, + std::vector *out_attrs) { CHECK_EQ(in_attrs->size(), static_cast(2)) << " in operator " << attrs.name; CHECK_EQ(out_attrs->size(), static_cast(1)) << " in operator " << attrs.name; - return ElemwiseStorageAttr( - attrs, in_attrs, out_attrs); + auto &in = *in_attrs; + auto &out = *out_attrs; + CHECK_NE(in[1], kUndefinedStorage) << "rhs storage type must be known"; + if (in[0] == kUndefinedStorage) in[0] = in[1]; + if (out[0] == kUndefinedStorage) out[0] = in[1]; + return true; } // Transfer gradient and input to FGradient function @@ -163,6 +165,22 @@ struct ElemwiseGradUseNone { } }; +// TODO(haibin) this is a temporary function for debugging purpose. Remove later. 
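// ---------------------------------------------------------------------------
// A small self-contained sketch of the storage-type rule that the rewritten
// IdentityAttrLikeRhsStorageType above encodes: the rhs storage type must
// already be known, and both the lhs input and the output fall back to it
// when they are still undefined. The enum below only mirrors the intent; it
// is not the MXNet NDArrayStorageType definition.
#include <cassert>
#include <vector>

enum Storage { kUndefined = -1, kDefault = 0, kRowSparse = 1, kCSR = 2 };

bool IdentityLikeRhsStorage(std::vector<Storage>* in, std::vector<Storage>* out) {
  assert(in->size() == 2 && out->size() == 1);
  assert((*in)[1] != kUndefined);             // rhs storage type must be known
  if ((*in)[0] == kUndefined)  (*in)[0]  = (*in)[1];
  if ((*out)[0] == kUndefined) (*out)[0] = (*in)[1];
  return true;
}

// e.g. in = {kUndefined, kRowSparse}, out = {kUndefined}
//   -> in[0] and out[0] both become kRowSparse.
// ---------------------------------------------------------------------------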
+template +void print_info(const mshadow::Tensor& tensor, const std::string& name) { + std::cout << "Tensor " << name << " with shape ("; + int len = 1; + for (int i = 0; i < dim; i++) { + len *= tensor.shape_[i]; + std::cout << tensor.shape_[i] << ","; + if (i == dim - 1) std::cout << ")"; + } + std::cout << std::endl; + for (int j = 0; j < len; j ++) std::cout << tensor.dptr_[j] << " "; + std::cout << std::endl; +} + + } // namespace op } // namespace mxnet diff --git a/src/operator/operator_common.h b/src/operator/operator_common.h index 205a6e337a2a..78c03685f36f 100755 --- a/src/operator/operator_common.h +++ b/src/operator/operator_common.h @@ -313,17 +313,17 @@ inline void ParamParser(nnvm::NodeAttrs* attrs) { } template -void FComputeExFallback(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs, - FCompute fcompute) { - std::vector input_blobs, output_blobs; - std::vector tmp_nds; - common::PrepDefaultBlobs(inputs, outputs, &input_blobs, - &output_blobs, &tmp_nds, false, ctx); - fcompute(attrs, ctx, input_blobs, req, output_blobs); +void FCompExFallback(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs, + FCompute fcompute) { + std::vector in_blobs, out_blobs; + std::vector tmps; + common::GetInputBlobs(inputs, &in_blobs, &tmps, ctx); + common::GetOutputBlobs(outputs, &out_blobs, false); + fcompute(attrs, ctx, in_blobs, req, out_blobs); } diff --git a/src/operator/optimizer_op-inl.h b/src/operator/optimizer_op-inl.h index a4115f5960eb..04d7d03a52ba 100755 --- a/src/operator/optimizer_op-inl.h +++ b/src/operator/optimizer_op-inl.h @@ -111,8 +111,9 @@ struct SparseSGDDnsRspKernel { // IType is row sparse idx type // i is the ith row in row sparse gradient template - MSHADOW_XINLINE static void Map(int i, size_t width, DType* out, const DType* weight, const IType* grad_idx, - const DType *grad_val, const DType clip_gradient, const DType lr, + MSHADOW_XINLINE static void Map(int i, size_t width, DType* out, const DType* weight, + const IType* grad_idx, const DType *grad_val, + const DType clip_gradient, const DType lr, const DType wd, const DType rescale_grad) { for (size_t j = 0; j < width; j++) { uint64_t data_i = grad_idx[i] * width + j; @@ -159,7 +160,6 @@ inline void SparseSGDUpdateDnsRspImpl(const SGDParam& param, static_cast(param.clip_gradient), static_cast(param.lr), static_cast(param.wd), static_cast(param.rescale_grad)); - }); }); }); diff --git a/src/operator/optimizer_op.cc b/src/operator/optimizer_op.cc index 27c39db864f5..9a776a94bf0c 100644 --- a/src/operator/optimizer_op.cc +++ b/src/operator/optimizer_op.cc @@ -46,7 +46,7 @@ It updates the weights using:: .set_attr_parser(ParamParser) .set_attr("FInferShape", ElemwiseShape<2, 1>) .set_attr("FInferType", ElemwiseType<2, 1>) -// TODO write FCompute for sparse sgd +// TODO(haibin) write FCompute for sparse sgd // .set_attr("FCompute", SGDUpdate) .set_attr(FCOMP_EX_CPU, SparseSGDUpdateEx) .add_argument("weight", "NDArray-or-Symbol", "Weight") diff --git a/src/operator/optimizer_op.cu b/src/operator/optimizer_op.cu index 2b2667ec317b..24c2d8d7fd02 100644 --- a/src/operator/optimizer_op.cu +++ b/src/operator/optimizer_op.cu @@ -15,6 +15,9 @@ NNVM_REGISTER_OP(sgd_update) NNVM_REGISTER_OP(sgd_mom_update) .set_attr("FCompute", SGDMomUpdate); +NNVM_REGISTER_OP(sparse_sgd_update) +.set_attr(FCOMP_EX_GPU, SparseSGDUpdateEx); + NNVM_REGISTER_OP(adam_update) 
.set_attr("FCompute", AdamUpdate); diff --git a/src/operator/tensor/elemwise_binary_op.h b/src/operator/tensor/elemwise_binary_op.h index 0b4ce1a6b381..362e6b4805e0 100644 --- a/src/operator/tensor/elemwise_binary_op.h +++ b/src/operator/tensor/elemwise_binary_op.h @@ -41,42 +41,35 @@ void BinaryComputeRspRsp(const nnvm::NodeAttrs& attrs, const std::vector& inputs, const std::vector& req, const std::vector& outputs) { - CHECK_EQ(inputs.size(), 2); - CHECK_EQ(outputs.size(), 1); - auto &nd_l = inputs[0]; - auto &nd_r = inputs[1]; + auto &lhs = inputs[0]; + auto &rhs = inputs[1]; auto &output = outputs[0]; - CHECK_EQ(nd_l.storage_type(), kRowSparseStorage) << "Sparse type not supported yet"; - // Memory Estimation - unsigned int num_rows_l = nd_l.aux_shape(rowsparse::kIdx).Size(); - unsigned int num_rows_r = nd_r.aux_shape(rowsparse::kIdx).Size(); - // TODO(haibin) both zero? - if (num_rows_l + num_rows_r == 0) { + bool init_l = lhs.storage_initialized(); + bool init_r = rhs.storage_initialized(); + // both inputs are zeros + if (!init_l && !init_r) return; + // one of the input is zeros + if (!init_l || !init_r) { + NDArray out(output); + CopyFromToRspImpl(!init_l ? rhs : lhs, &out, ctx.run_ctx, true); return; } - // This is (roughly) the number of result rows + // Memory Estimation: This is (roughly) the number of result rows. We still + // need to subtract the number of common rows + unsigned int num_rows_l = lhs.aux_shape(rowsparse::kIdx).Size(); + unsigned int num_rows_r = rhs.aux_shape(rowsparse::kIdx).Size(); output.CheckAndAlloc({TShape({num_rows_l + num_rows_r})}); mshadow::Stream *s = ctx.get_stream(); - if (num_rows_l == 0) { - NDArray out(output); - CopyFromTo(nd_r, &out); - return; - } - if (num_rows_r == 0) { - NDArray out(output); - CopyFromTo(nd_l, &out); - return; - } MSHADOW_TYPE_SWITCH(output.dtype(), DType, { - MSHADOW_TYPE_SWITCH(nd_l.aux_type(rowsparse::kIdx), IType, { + MSHADOW_TYPE_SWITCH(lhs.aux_type(rowsparse::kIdx), IType, { // Indices - auto indices_l = nd_l.aux_data(rowsparse::kIdx).FlatTo1D(s); - auto indices_r = nd_r.aux_data(rowsparse::kIdx).FlatTo1D(s); + auto indices_l = lhs.aux_data(rowsparse::kIdx).FlatTo1D(s); + auto indices_r = rhs.aux_data(rowsparse::kIdx).FlatTo1D(s); auto indices_out = output.aux_data(rowsparse::kIdx).FlatTo1D(s); // Data - auto data_l = nd_l.data().FlatTo2D(s); - auto data_r = nd_r.data().FlatTo2D(s); + auto data_l = lhs.data().FlatTo2D(s); + auto data_r = rhs.data().FlatTo2D(s); auto out = output.data().FlatTo2D(s); // TODO(haibin) A more appropriate way: Copy to output, then apply ops @@ -134,12 +127,12 @@ void BinaryComputeEx(const nnvm::NodeAttrs& attrs, if (typeid(OP) == typeid(mshadow::op::plus)) { // If any input is dense, fallback to FCompute if (common::ContainsDefaultStorage(inputs)) { - FComputeExFallback(attrs, ctx, inputs, req, outputs, BinaryCompute); + FCompExFallback(attrs, ctx, inputs, req, outputs, BinaryCompute); return; } CHECK_EQ(inputs[0].storage_type(), kRowSparseStorage) << "Sparse type not supported yet"; CHECK_EQ(inputs[1].storage_type(), kRowSparseStorage) << "Sparse type not supported yet"; - BinaryComputeRspRsp(attrs, ctx, inputs, req, outputs); + BinaryComputeRspRsp(attrs, ctx, inputs, req, outputs); return; } else { LOG(FATAL) << "Not implemented"; @@ -175,6 +168,8 @@ void BinaryBackwardUseNoneRsp(const nnvm::NodeAttrs& attrs, using namespace mshadow::expr; Stream *s = ctx.get_stream(); CHECK_EQ(inputs[0].storage_type(), kRowSparseStorage); + CHECK_EQ(outputs[0].storage_type(), kRowSparseStorage); + 
CHECK_EQ(outputs[1].storage_type(), kRowSparseStorage); CHECK(typeid(LOP) == typeid(mshadow_op::identity)); CHECK(typeid(ROP) == typeid(mshadow_op::identity)); TShape shape = inputs[0].aux_shape(rowsparse::kIdx); @@ -208,6 +203,7 @@ void BinaryBackwardUseNoneEx(const nnvm::NodeAttrs& attrs, auto stype = inputs[0].storage_type(); CHECK_EQ(stype, kRowSparseStorage) << "Not implemented yet"; BinaryBackwardUseNoneRsp(attrs, ctx, inputs, req, outputs); + // TODO(haibin) fallback for kDefaultStorage } template diff --git a/src/operator/tensor/elemwise_binary_op_basic.cu b/src/operator/tensor/elemwise_binary_op_basic.cu index e2a0eb75c579..4f88aecfdf9f 100644 --- a/src/operator/tensor/elemwise_binary_op_basic.cu +++ b/src/operator/tensor/elemwise_binary_op_basic.cu @@ -9,7 +9,8 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(elemwise_add) -.set_attr("FCompute", BinaryCompute); +.set_attr("FCompute", BinaryCompute) +.set_attr(FCOMP_EX_GPU, BinaryComputeEx); NNVM_REGISTER_OP(_grad_add) .set_attr("FCompute", BinaryCompute); @@ -17,7 +18,9 @@ NNVM_REGISTER_OP(_grad_add) NNVM_REGISTER_OP(_backward_add) .set_attr("FCompute", BinaryBackwardUseNone); + mshadow_op::identity, mshadow_op::identity>) +.set_attr(FCOMP_EX_GPU, + BinaryBackwardUseNoneEx); NNVM_REGISTER_OP(_sub) .set_attr("FCompute", BinaryCompute); diff --git a/src/operator/tensor/elemwise_unary_op.cc b/src/operator/tensor/elemwise_unary_op.cc index 879af02373a9..ae9673621570 100644 --- a/src/operator/tensor/elemwise_unary_op.cc +++ b/src/operator/tensor/elemwise_unary_op.cc @@ -83,7 +83,7 @@ NNVM_REGISTER_OP(_identity_with_attr_like_rhs) .set_attr("FIgnoreInputs", [](const NodeAttrs& attrs) { return std::vector(1, 1); }) .set_attr("FCompute", IdentityCompute) -.set_attr(FCOMP_EX_CPU, IdentityComputeEx) +.set_attr(FCOMP_EX_CPU, IdentityLikeRhsComputeEx) .set_attr("FInferShape", ElemwiseShape<2, 1>) .set_attr("FInferStorageType", IdentityAttrLikeRhsStorageType) .set_attr( diff --git a/src/operator/tensor/elemwise_unary_op.cu b/src/operator/tensor/elemwise_unary_op.cu index c5a72b4e8c4f..24b8a25ecbae 100644 --- a/src/operator/tensor/elemwise_unary_op.cu +++ b/src/operator/tensor/elemwise_unary_op.cu @@ -23,7 +23,9 @@ NNVM_REGISTER_OP(make_loss) // identity output as first input, but attributes are constrainted to be like rhs NNVM_REGISTER_OP(_identity_with_attr_like_rhs) -.set_attr("FCompute", IdentityCompute); +.set_attr("FCompute", IdentityCompute) +.set_attr(FCOMP_EX_GPU, IdentityLikeRhsComputeEx); + NNVM_REGISTER_OP(Cast) .set_attr("FCompute", CastCompute); diff --git a/src/operator/tensor/elemwise_unary_op.h b/src/operator/tensor/elemwise_unary_op.h index 48aac5471bc8..2380c9d41dfd 100644 --- a/src/operator/tensor/elemwise_unary_op.h +++ b/src/operator/tensor/elemwise_unary_op.h @@ -68,32 +68,27 @@ void IdentityComputeRsp(const nnvm::NodeAttrs& attrs, using namespace mshadow; using namespace mshadow::expr; Stream *s = ctx.get_stream(); - NDArrayStorageType storage_type = inputs[1].storage_type(); - CHECK_EQ(storage_type, kRowSparseStorage); - if (req[0] == kNullOp) { - LOG(FATAL) << "kNullOp in IdentityComputeEx not supported yet"; - } - if (req[0] == kWriteInplace) { - LOG(FATAL) << "kWriteInplace for sparse storage not supported yet"; - } - TShape shape = inputs[1].aux_shape(rowsparse::kIdx); - if (shape.ndim() == 0) return; - outputs[0].CheckAndAlloc({shape}); - MSHADOW_TYPE_SWITCH(outputs[0].dtype(), DType, { - MSHADOW_TYPE_SWITCH(outputs[0].aux_type(rowsparse::kIdx), AuxType, { - auto out_d = outputs[0].data().FlatTo1D(s); - auto 
out_aux = outputs[0].aux_data(rowsparse::kIdx).FlatTo1D(s); - auto in_aux = inputs[1].aux_data(rowsparse::kIdx).FlatTo1D(s); + auto &input = inputs[0]; + auto &output = outputs[0]; + CHECK_NE(req[0], kNullOp) << "kNullOp in IdentityComputeEx not supported yet"; + CHECK_NE(req[0], kWriteInplace) << "kWriteInplace in IdentityComputeEx not supported yet"; + if (!input.storage_initialized()) return; + TShape shape = input.aux_shape(rowsparse::kIdx); + output.CheckAndAlloc({shape}); + MSHADOW_TYPE_SWITCH(output.dtype(), DType, { + MSHADOW_TYPE_SWITCH(output.aux_type(rowsparse::kIdx), AuxType, { + auto out_d = output.data().FlatTo1D(s); + auto out_aux = output.aux_data(rowsparse::kIdx).FlatTo1D(s); + auto in_aux = input.aux_data(rowsparse::kIdx).FlatTo1D(s); ASSIGN_DISPATCH(out_d, req[0], - F(inputs[1].data().FlatTo1D(s))); + F(input.data().FlatTo1D(s))); ASSIGN_DISPATCH(out_aux, req[0], F(in_aux)); }); }); } -// FIXME the index is hard coded for _identity_with_attr_like_rhs op template -void IdentityComputeEx(const nnvm::NodeAttrs& attrs, +void IdentityLikeRhsComputeEx(const nnvm::NodeAttrs& attrs, const OpContext& ctx, const std::vector& inputs, const std::vector& req, @@ -103,9 +98,13 @@ void IdentityComputeEx(const nnvm::NodeAttrs& attrs, CHECK_EQ(inputs.size(), 2); CHECK_EQ(outputs.size(), 1); Stream *s = ctx.get_stream(); - NDArrayStorageType stype = inputs[1].storage_type(); - CHECK_EQ(stype, kRowSparseStorage) << "Not implemented yet"; - IdentityComputeRsp(attrs, ctx, inputs, req, outputs); + size_t rhs_idx = 1; + NDArrayStorageType stype = inputs[rhs_idx].storage_type(); + if (stype == kRowSparseStorage) { + IdentityComputeRsp(attrs, ctx, inputs, req, outputs); + } else { + LOG(FATAL) << "Not implemented yet"; + } } struct CastParam : public dmlc::Parameter { @@ -199,12 +198,11 @@ struct FillRspRowIdx { * Will revisit this interface in the future. */ template -void CastStorageDnsRspImpl(const OpContext& ctx, const TBlob& dns, NDArray* rsp) { +void CastStorageDnsRspImpl(mshadow::Stream *s, const TBlob& dns, NDArray* rsp) { CHECK(rsp != nullptr); CHECK_EQ(rsp->storage_type(), kRowSparseStorage); CHECK_EQ(dns.shape_, rsp->shape()); - mshadow::Stream *s = ctx.get_stream(); rsp->CheckAndAllocAuxData(rowsparse::kIdx, mshadow::Shape1(dns.shape_[0])); MSHADOW_TYPE_SWITCH(dns.type_flag_, DType, { // data type NDARRAY_IDX_TYPE_SWITCH(rsp->aux_type(rowsparse::kIdx), RType, { // row idx type @@ -220,7 +218,10 @@ void CastStorageDnsRspImpl(const OpContext& ctx, const TBlob& dns, NDArray* rsp) for (index_t i = 0; i < num_rows; ++i) { if (row_idx[i] < static_cast(num_rows)) ++nnr; } - if (0 == nnr) return; // zero matrix + if (0 == nnr) { + rsp->SetAuxShape(rowsparse::kIdx, TShape({0})); + return; // zero matrix + } rsp->CheckAndAllocData(mshadow::Shape2(nnr, num_cols)); // TODO(junwu): single thread for compressing row_idx and copying data // from dns to rsp, might be a bottleneck. 
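// ---------------------------------------------------------------------------
// Standalone sketch of the dense -> row-sparse cast that CastStorageDnsRspImpl
// above performs: find the rows containing at least one non-zero, then copy
// only those rows into the compact row-sparse index/data buffers. Plain STL
// types are used for clarity; they are not the MXNet NDArray interfaces.
#include <cstddef>
#include <vector>

struct RspOut {
  std::vector<int> row_idx;    // indices of stored (non-zero) rows
  std::vector<float> data;     // compacted rows, row_idx.size() x cols
};

RspOut CastDnsToRsp(const std::vector<float>& dns, size_t rows, size_t cols) {
  RspOut rsp;
  for (size_t r = 0; r < rows; ++r) {
    bool nonzero = false;
    for (size_t c = 0; c < cols; ++c) {
      if (dns[r * cols + c] != 0.0f) { nonzero = true; break; }
    }
    if (!nonzero) continue;                  // skip all-zero rows
    rsp.row_idx.push_back(static_cast<int>(r));
    for (size_t c = 0; c < cols; ++c) rsp.data.push_back(dns[r * cols + c]);
  }
  // An empty row_idx corresponds to the zero-matrix case handled above by
  // setting the aux shape to {0}.
  return rsp;
}
// ---------------------------------------------------------------------------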
@@ -239,30 +240,47 @@ void CastStorageDnsRspImpl(const OpContext& ctx, const TBlob& dns, NDArray* rsp) }); } +// TODO(haibin) Use memcopy instead will be much faster than assigning each individual element +struct CastStorageRspDnsKernel { + template + MSHADOW_XINLINE static void Map(int i, const index_t width, const IType* idx, const DType *data, + DType* dns, const index_t invalid_rid) { + auto rid = idx[i]; + // skip invalid rows + if (rid == invalid_rid) return; + auto dns_offset = rid * width; + auto rsp_offset = i * width; + for (size_t col = 0; col < width; col++) { + dns[dns_offset + col] = data[rsp_offset + col]; + } + } +}; + + /*! * \brief This function assumes that the meomry for dns has been allocated already * since the shape is known at binding stage. */ template -void CastStorageRspDnsImpl(const OpContext& ctx, const NDArray& rsp, TBlob* dns) { +void CastStorageRspDnsImpl(mshadow::Stream *s, const NDArray& rsp, TBlob* dns) { using namespace mshadow; using namespace mshadow::expr; - Stream *s = ctx.get_stream(); CHECK_EQ(rsp.storage_type(), kRowSparseStorage); MSHADOW_TYPE_SWITCH(dns->type_flag_, DType, { - MSHADOW_TYPE_SWITCH(rsp.aux_type(rowsparse::kIdx), IType, { + NDARRAY_IDX_TYPE_SWITCH(rsp.aux_type(rowsparse::kIdx), IType, { // assign zeros mxnet_op::Kernel::Launch(s, dns->Size(), dns->dptr()); - // data() is not empty - if (rsp.storage_shape().ndim() != 0) { - // Copy over - auto in_data = rsp.data().FlatTo2D(s); - auto out_data = dns->FlatTo2D(s); - auto num_rows = rsp.aux_shape(rowsparse::kIdx)[0]; - auto in_idx = rsp.aux_data(rowsparse::kIdx).FlatTo1D(s); - for (size_t i = 0; i < num_rows; i += 1) { - mshadow::Copy(out_data[in_idx[i]], in_data[i], s); - } + if (rsp.storage_initialized()) { + // copy over row by row + auto in_idx = rsp.aux_data(rowsparse::kIdx).FlatTo1D(s).dptr_; + auto in_data = rsp.data().FlatTo2D(s).dptr_; + auto out_data = dns->FlatTo2D(s).dptr_; + auto num_rows = rsp.aux_shape(rowsparse::kIdx).Size(); + auto rsp_shape = rsp.shape(); + auto invalid_rid = rsp_shape[0]; + auto width = rsp_shape.ProdShape(1, rsp_shape.ndim()); + mxnet_op::Kernel::Launch(s, num_rows, width, in_idx, in_data, + out_data, invalid_rid); } }); }); @@ -337,13 +355,12 @@ struct FillCsrColIdxAndVals { * Will revisit this interface in the future. */ template -void CastStorageDnsCsrImpl(const OpContext& ctx, const TBlob& dns, NDArray* csr) { +void CastStorageDnsCsrImpl(mshadow::Stream *s, const TBlob& dns, NDArray* csr) { CHECK(csr != nullptr); CHECK_EQ(csr->storage_type(), kCSRStorage); CHECK_EQ(dns.shape_.ndim(), 2); CHECK_EQ(dns.shape_, csr->shape()); - mshadow::Stream *s = ctx.get_stream(); MSHADOW_TYPE_SWITCH(dns.type_flag_, DType, { // data type NDARRAY_IDX_TYPE_SWITCH(csr->aux_type(csr::kIndPtr), IType, { // indptr type NDARRAY_IDX_TYPE_SWITCH(csr->aux_type(csr::kIdx), CType, { // col idx type @@ -408,13 +425,12 @@ struct CopyCsrDataToDns { * Will revisit this interface in the future. 
*/ template -void CastStorageCsrDnsImpl(const OpContext& ctx, const NDArray& csr, TBlob* dns) { +void CastStorageCsrDnsImpl(mshadow::Stream *s, const NDArray& csr, TBlob* dns) { CHECK(dns != nullptr); CHECK_EQ(csr.storage_type(), kCSRStorage); CHECK_EQ(dns->shape_.ndim(), 2); CHECK_EQ(dns->shape_, csr.shape()); - mshadow::Stream *s = ctx.get_stream(); MSHADOW_TYPE_SWITCH(dns->type_flag_, DType, { // data type NDARRAY_IDX_TYPE_SWITCH(csr.aux_type(csr::kIndPtr), IType, { // indptr type NDARRAY_IDX_TYPE_SWITCH(csr.aux_type(csr::kIdx), CType, { // col idx type @@ -422,6 +438,7 @@ void CastStorageCsrDnsImpl(const OpContext& ctx, const NDArray& csr, TBlob* dns) const index_t num_cols = dns->shape_[1]; DType* dns_data = dns->dptr(); mxnet_op::Kernel::Launch(s, dns->shape_.Size(), dns_data); + if (!csr.storage_initialized()) return; const IType* indptr = csr.aux_data(csr::kIndPtr).dptr(); const CType* col_idx = csr.aux_data(csr::kIdx).dptr(); const DType* csr_data = csr.data().dptr(); @@ -446,6 +463,30 @@ inline bool CastStorageInferStorageType(const nnvm::NodeAttrs& attrs, return true; } +template +void CastStorageComputeImpl(mshadow::Stream *s, + const NDArray& input, + const NDArray& output) { + using namespace mshadow; + using namespace mshadow::expr; + const auto src_stype = input.storage_type(); + const auto dst_stype = output.storage_type(); + if (src_stype == kRowSparseStorage && dst_stype == kDefaultStorage) { + TBlob ret = output.data(); + CastStorageRspDnsImpl(s, input, &ret); + } else if (src_stype == kDefaultStorage && dst_stype == kRowSparseStorage) { + NDArray ret = output; // get rid of the const qualifer + CastStorageDnsRspImpl(s, input.data(), &ret); + } else if (src_stype == kDefaultStorage && dst_stype == kCSRStorage) { + NDArray ret = output; // get rid of the const qualifer + CastStorageDnsCsrImpl(s, input.data(), &ret); + } else if (src_stype == kCSRStorage && dst_stype == kDefaultStorage) { + TBlob ret = output.data(); + CastStorageCsrDnsImpl(s, input, &ret); + } else { + LOG(FATAL) << "Not implemented"; + } +} template void CastStorageComputeEx(const nnvm::NodeAttrs& attrs, const OpContext& ctx, @@ -457,23 +498,7 @@ void CastStorageComputeEx(const nnvm::NodeAttrs& attrs, Stream *s = ctx.get_stream(); CHECK_EQ(inputs.size(), 1); CHECK_EQ(outputs.size(), 1); - const auto src_stype = inputs[0].storage_type(); - const auto dst_stype = outputs[0].storage_type(); - if (src_stype == kRowSparseStorage && dst_stype == kDefaultStorage) { - TBlob ret = outputs[0].data(); - CastStorageRspDnsImpl(ctx, inputs[0], &ret); - } else if (src_stype == kDefaultStorage && dst_stype == kRowSparseStorage) { - NDArray ret = outputs[0]; // get rid of the const qualifer - CastStorageDnsRspImpl(ctx, inputs[0].data(), &ret); - } else if (src_stype == kDefaultStorage && dst_stype == kCSRStorage) { - NDArray ret = outputs[0]; // get rid of the const qualifer - CastStorageDnsCsrImpl(ctx, inputs[0].data(), &ret); - } else if (src_stype == kCSRStorage && dst_stype == kDefaultStorage) { - TBlob ret = outputs[0].data(); - CastStorageCsrDnsImpl(ctx, inputs[0], &ret); - } else { - LOG(FATAL) << "Not implemented"; - } + CastStorageComputeImpl(s, inputs[0], outputs[0]); } #define MXNET_OPERATOR_REGISTER_UNARY(name) \ diff --git a/src/operator/tensor/indexing_op.cc b/src/operator/tensor/indexing_op.cc index f9023054a10f..fed4b4dd229b 100644 --- a/src/operator/tensor/indexing_op.cc +++ b/src/operator/tensor/indexing_op.cc @@ -86,6 +86,40 @@ NNVM_REGISTER_OP(_backward_Embedding) .set_attr("TIsBackward", true) 
.set_attr("FCompute", EmbeddingOpBackward); +NNVM_REGISTER_OP(SparseEmbedding) +.describe(R"code(Maps integer indices to vector representations (embeddings) with sparse weight update +)code" ADD_FILELINE) +.set_num_inputs(2) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"data", "weight"}; + }) +.set_attr("FInferShape", EmbeddingOpShape) +.set_attr("FInferType", EmbeddingOpType) +.set_attr("FResourceRequest", + [](const NodeAttrs& attrs) { + return std::vector{ResourceRequest::kTempSpace}; + }) +.set_attr("FCompute", EmbeddingOpForward) +.set_attr("FGradient", + [](const nnvm::NodePtr& n, const std::vector& ograds) { + return MakeNonlossGradNode("_backward_SparseEmbedding", n, ograds, + {n->inputs[0]}, n->attrs.dict); + }) +.add_argument("data", "NDArray-or-Symbol", "The input array to the embedding operator.") +.add_argument("weight", "NDArray-or-Symbol", "The embedding weight matrix.") +.add_arguments(EmbeddingParam::__FIELDS__()); + +NNVM_REGISTER_OP(_backward_SparseEmbedding) +.set_num_inputs(2) +.set_num_outputs(2) +.set_attr("TIsBackward", true) +.set_attr("FInferStorageType", SparseEmbeddingBackwardStorageType) +.set_attr("FComputeEx", SparseEmbeddingOpBackwardEx); +// TODO(haibin) handle dense case +// .set_attr("FCompute", EmbeddingOpBackward); NNVM_REGISTER_OP(take) .describe(R"code(Takes elements from an input array along the given axis. diff --git a/src/operator/tensor/indexing_op.h b/src/operator/tensor/indexing_op.h index 041ceab8a679..944adb4ace8d 100755 --- a/src/operator/tensor/indexing_op.h +++ b/src/operator/tensor/indexing_op.h @@ -315,6 +315,133 @@ void EmbeddingOpBackward(const nnvm::NodeAttrs& attrs, }); } +// todo template req +struct SparseEmbeddingBackwardRsp { + template + // size_t? + // i for each thread + // each thread is responsible for rows in output in [segment_start, segment_end) + MSHADOW_XINLINE static void Map(int i, const size_t width, IType* dst_idx, DType* dst_val, + const IType* idx, const size_t num_idx, const DType* src, + const size_t segment_len, const size_t num_rows, OpReqType req) { + size_t segment_start = i * segment_len; + size_t segment_end = (i + 1) * segment_len; + for (size_t y = 0; y < num_idx; y++) { + size_t j = idx[y]; + if (j > num_rows) j = num_rows - 1; + if (j < segment_start || j >= segment_end) continue; + dst_idx[j] = j; + for (size_t k = 0; k < width; k++) { + if (req == kWriteTo) { req = kAddTo;} + KERNEL_ASSIGN(dst_val[j * width + k], req, src[y * width + k]); + } + } + } +}; + +inline bool SparseEmbeddingBackwardStorageType(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ((*in_attrs)[0], kDefaultStorage); + CHECK_EQ((*in_attrs)[1], kDefaultStorage); + (*out_attrs)[0] = kRowSparseStorage; + return true; +} + +// todo replace xpu with cpu +template +void SparseEmbeddingOpBackwardDnsDnsRsp(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mshadow; + using namespace mxnet_op; + using namespace mshadow::expr; + CHECK_EQ(inputs.size(), 2U); + CHECK_EQ(outputs.size(), 2U); + // CHECK_EQ(req[embedding::kData], kNullOp) + // << "Embedding layer doesn't support calculate data gradient" << req[0] << " " << req[1]; + // idx shape (d1, d2 .. dk) + auto idx = inputs[1]; + // grad shape (d1, d2, .. 
dk, out_dim) + auto grad = inputs[0]; + // weight shape (in_dim, out_dim) + auto output = outputs[1]; + CHECK_EQ(idx.storage_type(), kDefaultStorage); + CHECK_EQ(grad.storage_type(), kDefaultStorage); + CHECK_EQ(output.dtype(), grad.dtype()); + CHECK_EQ(idx.dtype(), output.aux_type(rowsparse::kIdx)) << "Index type doesn't match"; + + const TShape& ishape = idx.shape(); + const TShape& oshape = grad.shape(); + + Stream *s = ctx.get_stream(); + CHECK_EQ(idx.dtype(), output.aux_type(rowsparse::kIdx)) + << "embedding input index and gradient row sparse type doesn't match!"; + // Alloc dense + unsigned int num_rows = output.shape()[0]; + output.CheckAndAlloc({TShape({num_rows})}); + MSHADOW_TYPE_SWITCH(output.dtype(), DType, { + MSHADOW_TYPE_SWITCH(idx.dtype(), IType, { + // Assuming aux_type == IType for now + // idx_data shape (d1 * d2 * .. dk) + // input embedding indice (d1 * d2 * .. dk), each idx in [0, input_dim) + auto idx_data = idx.data().FlatTo1D(s); + // grad_data shape (d1 * d2 * .. dk, out_dim) + auto grad_data = grad.data().get_with_shape( + Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1]), s); + // output shape (in_dim, out_dim) + auto output_idx = output.aux_data(rowsparse::kIdx).FlatTo1D(s); + auto output_val = output.data().FlatTo2D(s); + int num_threads = 64; + // dim_1 .. dim_k + size_t width = output.shape()[1]; + size_t segment_len = (num_rows + num_threads - 1) / num_threads; + // TODO(refactor me) fill with invalid values + for (size_t i = 0; i < num_rows; i++) { + output_idx.dptr_[i] = num_rows; + } + // fill 0 + for (size_t i = 0; i < output_val.shape_.Size(); i++) { + output_val.dptr_[i] = 0; + } + Kernel::Launch(s, num_threads, width, output_idx.dptr_, + output_val.dptr_, idx_data.dptr_, + ishape.Size(), grad_data.dptr_, segment_len, + num_rows, req[1]); + }); + }); +} + +// todo replace xpu with cpu +template +void SparseEmbeddingOpBackwardEx(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mshadow; + using namespace mxnet_op; + using namespace mshadow::expr; + CHECK_EQ(inputs.size(), 2U); + CHECK_EQ(outputs.size(), 2U); + // CHECK_EQ(req[embedding::kData], kNullOp) + // << "Embedding layer doesn't support calculate data gradient" << req[0] << " " << req[1]; + // idx shape (d1, d2 .. dk) + auto idx_stype = inputs[1].storage_type(); + // grad shape (d1, d2, .. 
dk, out_dim) + auto grad_stype = inputs[0].storage_type(); + // weight shape (in_dim, out_dim) + auto output_stype = outputs[1].storage_type(); + if (idx_stype == kDefaultStorage && grad_stype == kDefaultStorage && + output_stype == kRowSparseStorage) { + SparseEmbeddingOpBackwardDnsDnsRsp(attrs, ctx, inputs, req, outputs); + } else { + LOG(FATAL) << "Not implemented"; + } +} + namespace take_ { // to avoid name conflict enum TakeOpInputs {kArr, kIdx}; enum TakeOpOutputs {kOut}; diff --git a/src/operator/tensor/init_op.cc b/src/operator/tensor/init_op.cc index b091cbca2d9f..ddee0c7a46f5 100644 --- a/src/operator/tensor/init_op.cc +++ b/src/operator/tensor/init_op.cc @@ -21,7 +21,7 @@ NNVM_REGISTER_OP(_zeros) .set_attr("FInferShape", InitShape) .set_attr("FInferType", InitType) .set_attr("FCompute", FillCompute) -.set_attr(FCOMP_EX_CPU, FillComputeEx) +.set_attr(FCOMP_EX_CPU, FillComputeZerosEx) .add_arguments(InitOpParam::__FIELDS__()); NNVM_REGISTER_OP(_ones) diff --git a/src/operator/tensor/init_op.cu b/src/operator/tensor/init_op.cu index a798f26db60d..bcb10f70b3c3 100644 --- a/src/operator/tensor/init_op.cu +++ b/src/operator/tensor/init_op.cu @@ -9,7 +9,8 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_zeros) -.set_attr("FCompute", FillCompute); +.set_attr("FCompute", FillCompute) +.set_attr(FCOMP_EX_GPU, FillComputeZerosEx); NNVM_REGISTER_OP(_ones) .set_attr("FCompute", FillCompute); diff --git a/src/operator/tensor/init_op.h b/src/operator/tensor/init_op.h index 1c96c1f2cf5f..2339282a6740 100644 --- a/src/operator/tensor/init_op.h +++ b/src/operator/tensor/init_op.h @@ -15,6 +15,8 @@ #include #include #include "../elemwise_op_common.h" +#include "../mxnet_op.h" + namespace mxnet { namespace op { @@ -126,8 +128,31 @@ void FillCompute(const nnvm::NodeAttrs& attrs, }); } -template -void FillComputeEx(const nnvm::NodeAttrs& attrs, +// Fill a rsp NDArray with zeros by updating the aux shape. +template +void FillZerosRspImpl(mshadow::Stream *s, NDArray *dst) { + if (!dst->storage_initialized()) return; + // reset the shapes if it's not zeros + auto storage_shape = dst->storage_shape(); + storage_shape[0] = 0; + dst->SetAuxShape(rowsparse::kIdx, TShape({0})); + dst->SetStorageShape(storage_shape); +} + +// Fill a CSR NDArray with zeros by updating the aux shape. 
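// ---------------------------------------------------------------------------
// Standalone sketch of the sparse embedding backward pass implemented by
// SparseEmbeddingOpBackwardDnsDnsRsp above: each output-gradient row is
// accumulated into the weight-gradient row selected by the corresponding
// input index, and only the touched rows are kept, giving a row-sparse
// gradient. A std::map stands in for the row-sparse index/value buffers; the
// real kernel partitions rows across threads instead.
#include <cstddef>
#include <map>
#include <vector>

// idx: flattened input indices, one per gradient row
// grad: row-major output gradient, idx.size() x width
std::map<int, std::vector<float>> EmbeddingBackwardRsp(
    const std::vector<int>& idx, const std::vector<float>& grad, size_t width) {
  std::map<int, std::vector<float>> grad_weight;   // row index -> accumulated row
  for (size_t y = 0; y < idx.size(); ++y) {
    std::vector<float>& row = grad_weight[idx[y]];
    if (row.empty()) row.assign(width, 0.0f);
    for (size_t k = 0; k < width; ++k) {
      row[k] += grad[y * width + k];               // repeated indices accumulate
    }
  }
  return grad_weight;
}
// ---------------------------------------------------------------------------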
+template +void FillZerosCsrImpl(mshadow::Stream *s, NDArray *dst) { + if (!dst->storage_initialized()) return; + // reset the shapes if it's not zeros + TShape new_shape({0}); + dst->SetAuxShape(csr::kIndPtr, new_shape); + dst->SetAuxShape(csr::kIdx, new_shape); + dst->SetStorageShape(new_shape); +} + +// This operator never needs to fall back, since there's no input NDArray +template +void FillComputeZerosEx(const nnvm::NodeAttrs& attrs, const OpContext& ctx, const std::vector& inputs, const std::vector& req, @@ -138,7 +163,15 @@ void FillComputeEx(const nnvm::NodeAttrs& attrs, CHECK_EQ(outputs.size(), 1); CHECK_EQ(inputs.size(), 0); auto stype = outputs[0].storage_type(); - CHECK_EQ(value, 0) << "Not implemented yet"; + if (stype == kRowSparseStorage) { + NDArray nd(outputs[0]); + FillZerosRspImpl(s, &nd); + } else if (stype == kCSRStorage) { + NDArray nd(outputs[0]); + FillZerosCsrImpl(s, &nd); + } else { + LOG(FATAL) << "storage type not implemented."; + } } template diff --git a/src/operator/tensor/matrix_op-inl.h b/src/operator/tensor/matrix_op-inl.h index 0b53f11316b0..85adf85f0904 100644 --- a/src/operator/tensor/matrix_op-inl.h +++ b/src/operator/tensor/matrix_op-inl.h @@ -619,11 +619,13 @@ void DotCsrDnsDnsImpl(const OpContext& ctx, s, data_out.Size(), data_out.dptr()); } if (trans_lhs) { + if (!lhs.storage_initialized()) return; mxnet_op::Kernel, xpu>::Launch(s, data_out.Size(), data_out.dptr(), data_l.dptr(), indptr_l.dptr(), col_idx_l.dptr(), data_r.dptr(), lhs.shape()[0], rhs.shape()[1]); } else { + if (!lhs.storage_initialized()) return; mxnet_op::Kernel, xpu>::Launch(s, data_out.Size(), data_out.dptr(), data_l.dptr(), indptr_l.dptr(), col_idx_l.dptr(), data_r.dptr(), rhs.shape()[1]); @@ -767,14 +769,15 @@ void DotCsrDnsRspImpl(const OpContext& ctx, mxnet_op::Kernel::Launch( s, out_tmp.shape_.Size(), out_tmp.dptr_); } - // generate a temporary dns output - mxnet_op::Kernel, xpu>::Launch( - s, out_tmp.shape_.Size(), out_tmp.dptr_, data_l.dptr(), - indptr_l.dptr(), col_idx_l.dptr(), data_r.dptr(), - lhs.shape()[0], out_tmp.shape_[1]); - + if (lhs.storage_initialized()) { + // generate a temporary dns output + mxnet_op::Kernel, xpu>::Launch( + s, out_tmp.shape_.Size(), out_tmp.dptr_, data_l.dptr(), + indptr_l.dptr(), col_idx_l.dptr(), data_r.dptr(), + lhs.shape()[0], out_tmp.shape_[1]); + } // cast dns to rsp - CastStorageDnsRspImpl(ctx, TBlob(out_tmp), ret); + CastStorageDnsRspImpl(s, TBlob(out_tmp), ret); } else { // TODO(junwu): check whether the following code is a bottleneck // allocate output NDArray (single thread) @@ -859,6 +862,17 @@ void DotBackwardCsrDnsRsp(const nnvm::NodeAttrs& attrs, DotCsrDnsRspImpl(ctx, inputs[1], inputs[0], req[1], !param.transpose_a, &ret); } +template +void DotBackwardCsrDnsDns(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const DotParam& param = nnvm::get(attrs.parsed); + NDArray ret = outputs[1]; + DotCsrDnsDnsImpl(ctx, inputs[1], inputs[0], req[1], !param.transpose_a, &ret); +} + template void DotBackwardEx(const nnvm::NodeAttrs& attrs, const OpContext& ctx, @@ -875,12 +889,18 @@ void DotBackwardEx(const nnvm::NodeAttrs& attrs, // TODO(junwu): check whether this CHECK is reasonable const DotParam& param = nnvm::get(attrs.parsed); CHECK(!param.transpose_b) << "sparse dot only supports dot(A, X) and dot(A.T(), X)"; - if (inputs[0].storage_type() == kDefaultStorage // ograd dns format + // dns, csr, dns => *, rsp && inputs[1].storage_type() == 
kCSRStorage // csr input lhs of the op && inputs[2].storage_type() == kDefaultStorage // dns input rhs of the op && outputs[1].storage_type() == kRowSparseStorage) { // grad(rhs) rsp format DotBackwardCsrDnsRsp(attrs, ctx, inputs, req, outputs); + } else if (inputs[0].storage_type() == kDefaultStorage // ograd dns format + // dns, csr, dns => *, dns + && inputs[1].storage_type() == kCSRStorage // csr input lhs of the op + && inputs[2].storage_type() == kDefaultStorage // dns input rhs of the op + && outputs[1].storage_type() == kDefaultStorage) { // grad(rhs) dns format + DotBackwardCsrDnsDns(attrs, ctx, inputs, req, outputs); } else { // TODO(junwu): add fallback mechanism LOG(FATAL) << "Not supported"; diff --git a/src/operator/tensor/matrix_op.cc b/src/operator/tensor/matrix_op.cc index 4d8e30b46dc7..b3829f80c559 100644 --- a/src/operator/tensor/matrix_op.cc +++ b/src/operator/tensor/matrix_op.cc @@ -382,6 +382,10 @@ NNVM_REGISTER_OP(_backward_dot) .set_attr_parser(ParamParser) .set_attr("TIsBackward", true) .set_attr("FInferStorageType", DotBackwardInferStorageType) +.set_attr("FResourceRequest", + [](const NodeAttrs& attrs) { + return std::vector{ResourceRequest::kTempSpace}; + }) .set_attr("FCompute", DotBackward_) .set_attr("FComputeEx", DotBackwardEx) .add_arguments(DotParam::__FIELDS__()); diff --git a/src/operator/tensor/matrix_op.cu b/src/operator/tensor/matrix_op.cu index d10b0ccc46c2..2e1effb9e560 100644 --- a/src/operator/tensor/matrix_op.cu +++ b/src/operator/tensor/matrix_op.cu @@ -44,7 +44,9 @@ NNVM_REGISTER_OP(dot) .set_attr("FComputeEx", DotForwardEx); NNVM_REGISTER_OP(_backward_dot) -.set_attr("FCompute", DotBackward_); +.set_attr("FCompute", DotBackward_) +.set_attr("FComputeEx", DotBackwardEx); + NNVM_REGISTER_OP(batch_dot) .set_attr("FCompute", BatchDotForward_); diff --git a/tests/cpp/ndarray_test.cc b/tests/cpp/ndarray_test.cc index 7954eaa32ffc..e76ea1f42dfe 100644 --- a/tests/cpp/ndarray_test.cc +++ b/tests/cpp/ndarray_test.cc @@ -9,174 +9,43 @@ #include "../src/executor/graph_executor.h" #include "../src/operator/tensor/elemwise_binary_op.h" #include "../src/operator/tensor/elemwise_unary_op.h" +#include "../src/operator/tensor/indexing_op.h" #include "../src/operator/optimizer_op-inl.h" +#include "../src/operator/tensor/init_op.h" +#include "test_utils.h" -#define TEST_DTYPE float -#define TEST_ITYPE int32_t using namespace mxnet; -// TODO(haibin) these functions should be put in test_util.h -void CheckDataRegion(const TBlob &src, const TBlob &dst) { - auto size = src.shape_.Size() * mshadow::mshadow_sizeof(src.type_flag_); - auto equals = memcmp(src.dptr_, dst.dptr_, size); - EXPECT_EQ(equals, 0); -} - -NDArray GetIndexND(const TShape shape, const Context ctx, const std::vector &values) { - NDArray nd(shape, ctx, false, ROW_SPARSE_IDX_TYPE); - size_t num_val = values.size(); - MSHADOW_TYPE_SWITCH(nd.dtype(), DType, { - auto tensor = nd.data().FlatTo1D(); - for (size_t i = 0; i < num_val; i++) { - tensor[i] = values[i]; - } - }); - return nd; -} - -NDArray GetDenseND(const TShape shape, const Context ctx, const std::vector &values) { - NDArray nd(shape, ctx, false); - size_t num_val = values.size(); - CHECK_EQ(num_val, nd.shape().ProdShape(0, nd.shape().ndim())); - MSHADOW_TYPE_SWITCH(nd.dtype(), DType, { - auto tensor = nd.data().FlatTo1D(); - for (size_t i = 0; i < num_val; i++) { - tensor[i] = values[i]; - } - }); - return nd; -} - -NDArray GetRspND(const TShape shape, const Context ctx, const std::vector idx, - const std::vector vals) { - index_t num_rows = 
idx.size(); - index_t num_cols = vals.size() / idx.size(); - NDArray index = GetIndexND(TShape({num_rows}), ctx, idx); - CHECK_EQ(vals.size() % idx.size(), 0); - NDArray raw_data = GetDenseND(TShape({num_rows, num_cols}), ctx, vals); - NDArray nd(raw_data, {index}, ctx, kRowSparseStorage, shape); - return nd; -} - -NDArray Convert(NDArrayStorageType type, NDArray src) { - CHECK_EQ(type, kDefaultStorage); - NDArray converted(src.shape(), src.ctx(), false); - Engine::Get()->PushSync([src, converted](RunContext ctx) { - // TODO provide type in attrs, which is empty now - OpContext op_ctx; - op_ctx.run_ctx = ctx; - if (src.storage_type() == kRowSparseStorage) { - std::vector inputs({src}), outputs({converted}); - op::CastStorageComputeEx({}, op_ctx, inputs, {}, outputs); - } else if (src.storage_type() == kDefaultStorage) { - std::vector inputs({src.data()}), outputs({converted.data()}); - op::IdentityCompute({}, op_ctx, inputs, {kWriteTo}, outputs); - } else { - LOG(FATAL) << "unsupported storage type"; - } - }, src.ctx(), {src.var()}, {converted.var()}, - FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME); - converted.WaitToRead(); - return converted; -} - -// Operators -void BinaryDenseSparseTest() { - Context ctx = Context::CPU(); - - TShape output_shape({3, 2}); - NDArray input_nd0 = GetRspND(output_shape, ctx, {0, 1}, {10, 10, 10, 10}); - NDArray input_nd1 = GetDenseND(output_shape, ctx, {1, 2, 3, 4, 5, 6}); - NDArray output(kRowSparseStorage, output_shape, ctx); - - std::vector const_vars; - const_vars.push_back(input_nd0.var()); - const_vars.push_back(input_nd1.var()); - Engine::Get()->PushSync([input_nd0, input_nd1, output](RunContext ctx) { - nnvm::NodeAttrs attrs; - OpContext op_ctx; - std::vector inputs, outputs; - std::vector req; - inputs.push_back(input_nd0); - inputs.push_back(input_nd1); - outputs.push_back(output); - op::BinaryComputeEx(attrs, op_ctx, inputs, req, outputs); - }, input_nd0.ctx(), const_vars, {output.var()}, - FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME); - std::vector output_vals({11, 12, 3, 4, 15, 16}); - NDArray out_data = GetDenseND(output_shape, ctx, output_vals); - Engine::Get()->WaitForAll(); - CheckDataRegion(out_data.data(), output.data()); - // TODO(haibin) also check with zeros.. 
-} - -void BinaryRsRsTest() { - Context ctx = Context::CPU(); - - TShape index_shape({2}); - NDArray index0 = GetIndexND(index_shape, ctx, {0, 1}); - NDArray index1 = GetIndexND(index_shape, ctx, {0, 2}); - - TShape data_shape({2, 2}); - NDArray raw_data0 = GetDenseND(data_shape, ctx, {10, 10, 10, 10}); - NDArray raw_data1 = GetDenseND(data_shape, ctx, {5, 5, 5, 5}); - - NDArray input_nd0(raw_data0, {index0}, ctx, kRowSparseStorage, data_shape); - NDArray input_nd1(raw_data1, {index1}, ctx, kRowSparseStorage, data_shape); - - TShape output_shape({4, 2}); - NDArray output(kRowSparseStorage, output_shape, ctx); - std::vector const_vars; - const_vars.push_back(input_nd0.var()); - const_vars.push_back(input_nd1.var()); - - Engine::Get()->PushSync([input_nd0, input_nd1, output](RunContext ctx) { - OpContext op_ctx; - std::vector inputs, outputs; - std::vector req; - inputs.push_back(input_nd0); - inputs.push_back(input_nd1); - outputs.push_back(output); - op::BinaryComputeRspRsp({}, op_ctx, inputs, req, outputs); - }, input_nd0.ctx(), const_vars, {output.var()}, - FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME); - - - // Check the data region of output ndarray - NDArray dense_output = GetDenseND(output_shape, ctx, {15, 15, 10, 10, 5, 5, 0, 0}); - NDArray copy = Convert(kDefaultStorage, output); - CheckDataRegion(input_nd0.data(), raw_data0.data()); - CheckDataRegion(input_nd1.data(), raw_data1.data()); - CheckDataRegion(dense_output.data(), copy.data()); -} - -// Conversion -void DenseToDenseConversionTest() { +// Conversion Tests +void CastDnsDnsTest() { Context ctx; TShape shape({2, 2}); - NDArray nd = GetDenseND(shape, ctx, {1, 2, 3, 10}); + NDArray nd = DnsND(shape, ctx, {}); auto nd_copy = Convert(kDefaultStorage, nd); CheckDataRegion(nd_copy.data(), nd.data()); } -void SparseToDenseConversionTest() { +void CastRspDnsTest() { Context ctx; // Sparse ndarray TShape shape({2, 2}); - NDArray nd = GetRspND(shape, ctx, {0}, {1, 1}); + float v1 = RandFloat(); + float v2 = RandFloat(); + NDArray nd = RspND(shape, ctx, {0}, {v1, v2}); // Dense ndarray - NDArray dense_nd = GetDenseND(shape, ctx, {1, 1, 0, 0}); + NDArray dense_nd = DnsND(shape, ctx, {v1, v2, 0, 0}); NDArray converted = Convert(kDefaultStorage, nd); CheckDataRegion(converted.data(), dense_nd.data()); } -// NDArray Function +// NDArray function tests void SetValueTest() { Context ctx = Context::CPU(); TShape data_shape({2, 2}); - NDArray nd0 = GetDenseND(data_shape, ctx, {10, 10, 10, 10}); + float v = RandFloat(); + NDArray nd0 = DnsND(data_shape, ctx, {v, v, v, v}); NDArray nd1(data_shape, ctx, false); - nd1 = 10; + nd1 = v; nd1.WaitToRead(); CheckDataRegion(nd0.data(), nd1.data()); } @@ -184,28 +53,35 @@ void SetValueTest() { // InferStorage void InferElemwiseStorageTest() { nnvm::NodeAttrs attrs; - attrs.name = "Test op"; + attrs.name = "test_op"; std::vector in_attrs({kRowSparseStorage, kDefaultStorage}); std::vector out_attrs({kUndefinedStorage}); - + // rsp, default -> default op::ElemwiseStorageType<2, 1>(attrs, &in_attrs, &out_attrs); EXPECT_EQ(out_attrs[0], kDefaultStorage); + // default, rsp -> default in_attrs = {kDefaultStorage, kRowSparseStorage}; out_attrs = {kUndefinedStorage}; op::ElemwiseStorageType<2, 1>(attrs, &in_attrs, &out_attrs); EXPECT_EQ(out_attrs[0], kDefaultStorage); + // rsp, rsp -> rsp + in_attrs = {kRowSparseStorage}; + out_attrs = {kUndefinedStorage, kUndefinedStorage}; + op::ElemwiseStorageType<1, 2>(attrs, &in_attrs, &out_attrs); + EXPECT_EQ(out_attrs[0], kRowSparseStorage); + EXPECT_EQ(out_attrs[1], 
kRowSparseStorage); } // Optimizer void SGDDnsRspTest() { TShape shape({4, 2}); Context ctx = Context::CPU(); - NDArray weight = GetDenseND(shape, ctx, {1, 2, 3, 4, 5, 6, 7, 8}); - NDArray rsp_grad = GetRspND(shape, ctx, {0, 3}, {1, 2, 3, 4}); + NDArray weight = DnsND(shape, ctx, {1, 2, 3, 4, 5, 6, 7, 8}); + NDArray rsp_grad = RspND(shape, ctx, {0, 3}, {1, 2, 3, 4}); NDArray output = weight; - float lr = 0.1; - float wd = 0.95; - float rescale = 2; + float lr = RandFloat(); + float wd = RandFloat(); + float rescale = RandFloat(); op::SGDParam param; param.lr = lr; param.wd = wd; @@ -214,37 +90,157 @@ void SGDDnsRspTest() { Engine::Get()->PushSync([weight, rsp_grad, output, param](RunContext ctx) { std::vector inputs{weight, rsp_grad}, outputs{output}; std::vector req({kAddTo}); - op::SGDUpdateDnsRspImpl(param, {}, inputs, req, outputs); + op::SparseSGDUpdateDnsRspImpl(param, {}, inputs, req, outputs); }, weight.ctx(), {rsp_grad.var()}, {output.var()}, FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME); auto sgd = [lr, wd, rescale] (TEST_DTYPE weight, TEST_DTYPE grad) { return (1.f-lr*wd)*weight - (lr*rescale)*grad; }; - NDArray expected = GetDenseND(shape, ctx, - {1 + sgd(1, 1), 2 + sgd(2, 2), 3, 4, 5, 6, - 7 + sgd(7, 3), 8 + sgd(8, 4)}); + NDArray expected = DnsND(shape, ctx, + {1 + sgd(1, 1), 2 + sgd(2, 2), 3, 4, 5, 6, + 7 + sgd(7, 3), 8 + sgd(8, 4)}); output.WaitToRead(); CheckDataRegion(output.data(), expected.data()); } +void CopyFromToRspDnsTest() { + Context ctx; + // Sparse ndarray + TShape shape({2, 2}); + NDArray nd = RspND(shape, ctx, {0}, {1, 1}); + // Dense ndarray + NDArray dns_nd = DnsND(shape, ctx, {}); + CopyFromTo(nd, &dns_nd); + dns_nd.WaitToRead(); + CheckDataRegion(nd.data(), dns_nd.data()); +} + +void CopyFromToRspRspReuseTest() { + Context ctx; + // Sparse ndarray + TShape shape({3, 2}); + NDArray nd = RspND(shape, ctx, {0}, {1,2}); + // Sparse ndarray with enough memory. It's expected to reuse the memory + NDArray dst_nd = RspND(shape, ctx, {0, 1, 2}, {6,6,6,6,6,6}); + nd.WaitToRead(); + CopyFromTo(nd, &dst_nd); + dst_nd.WaitToRead(); + CheckDataRegion(nd.data(), dst_nd.data()); + CHECK_EQ(dst_nd.aux_shape(rowsparse::kIdx)[0], 1); + CHECK_EQ(dst_nd.storage_shape()[0], 1); + CHECK_EQ(dst_nd.storage_shape()[1], 2); +} + + +void CopyFromToRspRspFreeTest() { + Context ctx; + // Sparse ndarray + TShape shape({3, 2}); + NDArray nd = RspND(shape, ctx, {0, 1}, {1,1,1,1}); + // Sparse ndarray with enough memory. 
It's expected to reuse the memory + NDArray dst_nd = RspND(shape, ctx, {0}, {2,2}); + nd.WaitToRead(); + CopyFromTo(nd, &dst_nd); + dst_nd.WaitToRead(); + CheckDataRegion(nd.data(), dst_nd.data()); +} + +void BinaryAddRspRsp() { + Context ctx = Context::CPU(); + + TShape output_shape({4, 2}); + NDArray input_nd0 = RspND(output_shape, ctx, {0, 1}, {10,10,10,10}); + NDArray input_nd1 = RspND(output_shape, ctx, {0, 2}, {5,5,5,5}); + + NDArray output(kRowSparseStorage, output_shape, ctx); + std::vector const_vars; + const_vars.push_back(input_nd0.var()); + const_vars.push_back(input_nd1.var()); + + Engine::Get()->PushSync([input_nd0, input_nd1, output](RunContext ctx) { + OpContext op_ctx; + std::vector inputs, outputs; + std::vector req; + inputs.push_back(input_nd0); + inputs.push_back(input_nd1); + outputs.push_back(output); + op::BinaryComputeRspRsp({}, op_ctx, inputs, req, outputs); + }, input_nd0.ctx(), const_vars, {output.var()}, + FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME); + + // Check the data region of output ndarray + NDArray dense_output = DnsND(output_shape, ctx, {15, 15, 10, 10, 5, 5, 0, 0}); + NDArray copy = Convert(kDefaultStorage, output); + CheckDataRegion(dense_output.data(), copy.data()); +} + +void SparseEmbeddingBackwardTest() { + Context ctx = Context::CPU(); + // d1 .. dk + // idx shape : (2, 3) + // input dim 4, output dim 2 + int input_dim = 4; + int output_dim = 2; + TShape idx_shape({2, 3}); + NDArray idx = RspIdxND(idx_shape, ctx, {1, 2, 3, 1, 2, 3}); + TShape grad_shape({2, 3, 2}); + NDArray grad = DnsND(grad_shape, ctx, {0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2}); + TShape out_shape({4, 2}); + NDArray output = NDArray(kRowSparseStorage, out_shape, ctx); + op::EmbeddingParam param; + param.input_dim = input_dim; + param.output_dim = output_dim; + param.dtype = 0; + + Engine::Get()->PushSync([idx, grad, output, param](RunContext ctx) { + std::vector inputs{grad, idx}, outputs{output, output}; + // this is a hack + std::vector req({kNullOp, kAddTo}); + op::SparseEmbeddingOpBackwardEx({}, {}, inputs, req, outputs); + }, output.ctx(), {grad.var(), idx.var()}, {output.var()}, + FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME); + + NDArray expected = DnsND(out_shape, ctx, {0,0,0,0,0,0,0,0}); + Engine::Get()->PushSync([idx, grad, expected, param](RunContext ctx) { + std::vector inputs{grad.data(), idx.data()}, outputs{expected.data(), expected.data()}; + std::vector req({kNullOp, kWriteTo}); + op::EmbeddingOpBackward({}, {}, inputs, req, outputs); + }, expected.ctx(), {grad.var(), idx.var()}, {expected.var()}, + FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME); + NDArray converted = Convert(kDefaultStorage, output); + expected.WaitToRead(); + CheckDataRegion(converted.data(), expected.data()); +} + + +TEST(NDArray, binary_add) { + BinaryAddRspRsp(); +} + TEST(NDArray, conversion) { - DenseToDenseConversionTest(); - SparseToDenseConversionTest(); + CastDnsDnsTest(); + CastRspDnsTest(); } TEST(NDArray, functions) { SetValueTest(); } -TEST(NDArray, basics) { - BinaryRsRsTest(); - //Wait for all operations to finish - Engine::Get()->WaitForAll(); - InferElemwiseStorageTest(); -} - TEST(NDArray, optimizer) { SGDDnsRspTest(); } +TEST(NDArray, copy) { + CopyFromToRspDnsTest(); + CopyFromToRspRspReuseTest(); + CopyFromToRspRspFreeTest(); +} + +TEST(NDArray, infer_storage) { + InferElemwiseStorageTest(); +} + +TEST(NDArray, sparse_embedding) { + SparseEmbeddingBackwardTest(); +} diff --git a/tests/cpp/test_utils.h b/tests/cpp/test_utils.h new file 
mode 100644 index 000000000000..45a7ba072934 --- /dev/null +++ b/tests/cpp/test_utils.h @@ -0,0 +1,105 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../src/operator/tensor/elemwise_binary_op.h" +#include "../src/operator/tensor/elemwise_unary_op.h" +#include "../src/operator/optimizer_op-inl.h" +#include "../src/operator/tensor/init_op.h" + +using namespace mxnet; +#define TEST_DTYPE float +#define TEST_ITYPE int32_t + +void CheckDataRegion(const TBlob &src, const TBlob &dst) { + auto size = src.shape_.Size() * mshadow::mshadow_sizeof(src.type_flag_); + auto equals = memcmp(src.dptr_, dst.dptr_, size); + EXPECT_EQ(equals, 0); +} + +float RandFloat() { + float v = rand() * 1.0 / RAND_MAX; + return v; +} + +// Get an NDArray with provided indices, prepared for a RowSparse NDArray. +NDArray RspIdxND(const TShape shape, const Context ctx, const std::vector &values) { + NDArray nd(shape, ctx, false, ROW_SPARSE_IDX_TYPE); + size_t num_val = values.size(); + MSHADOW_TYPE_SWITCH(nd.dtype(), DType, { + auto tensor = nd.data().FlatTo1D(); + for (size_t i = 0; i < num_val; i++) { + tensor[i] = values[i]; + } + }); + return nd; +} + +// Get a dense NDArray with provided values. +NDArray DnsND(const TShape shape, const Context ctx, std::vector vs) { + NDArray nd(shape, ctx, false); + size_t num_val = shape.Size(); + // generate random values + while (vs.size() < num_val) { + auto v = RandFloat(); + vs.push_back(v); + } + CHECK_EQ(vs.size(), nd.shape().Size()); + MSHADOW_TYPE_SWITCH(nd.dtype(), DType, { + auto tensor = nd.data().FlatTo1D(); + for (size_t i = 0; i < num_val; i++) { + tensor[i] = vs[i]; + } + }); + return nd; +} + +// Get a RowSparse NDArray with provided indices and values +NDArray RspND(const TShape shape, const Context ctx, const std::vector idx, + std::vector vals) { + CHECK(shape.ndim() <= 2) << "High dimensional row sparse not implemented yet"; + index_t num_rows = idx.size(); + index_t num_cols = vals.size() / idx.size(); + // create index NDArray + NDArray index = RspIdxND(TShape({num_rows}), ctx, idx); + CHECK_EQ(vals.size() % idx.size(), 0); + // create value NDArray + NDArray data = DnsND(TShape({num_rows, num_cols}), ctx, vals); + // create result nd + NDArray nd(kRowSparseStorage, shape, ctx, false, mshadow::default_type_flag, + {}, {TShape({num_rows})}); + // assign values + NDArray nd_aux = nd.AuxNDArray(0); + NDArray nd_data = nd.DataNDArray(); + CopyFromTo(index, &nd_aux); + CopyFromTo(data, &nd_data); + return nd; +} + +// TODO(haibin) support other types +NDArray Convert(NDArrayStorageType type, NDArray src) { + CHECK_EQ(type, kDefaultStorage); + NDArray converted(src.shape(), src.ctx(), false); + Engine::Get()->PushSync([src, converted](RunContext ctx) { + // TODO provide type in attrs, which is empty now + OpContext op_ctx; + op_ctx.run_ctx = ctx; + if (src.storage_type() == kRowSparseStorage) { + std::vector inputs({src}), outputs({converted}); + op::CastStorageComputeEx({}, op_ctx, inputs, {}, outputs); + } else if (src.storage_type() == kDefaultStorage) { + std::vector inputs({src.data()}), outputs({converted.data()}); + op::IdentityCompute({}, op_ctx, inputs, {kWriteTo}, outputs); + } else { + LOG(FATAL) << "unsupported storage type"; + } + }, src.ctx(), {src.var()}, {converted.var()}, + FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME); + converted.WaitToRead(); + return converted; +} diff --git a/tests/python/unittest/test_multi_device_exec.py b/tests/python/unittest/test_multi_device_exec.py index 
f80f40ba7c32..d5da9eefcae9 100644 --- a/tests/python/unittest/test_multi_device_exec.py +++ b/tests/python/unittest/test_multi_device_exec.py @@ -38,7 +38,7 @@ def check_ctx_group_sparse(mode='dense_sparse'): dense_np = np.array([[1,2],[3,4],[5,6]]) sparse_np1 = np.array([[5,10],[0,0],[0,0]]) dense_nd = mx.nd.array(dense_np) - val = mx.nd.array([5, 10]); + val = mx.nd.array([[5, 10]]); idx = mx.nd.array([0], dtype=np.int32); sparse_nd1 = mx.sparse_nd.row_sparse(val, idx, (3,2)) sparse_nd2 = mx.sparse_nd.row_sparse(val, idx, (3,2)) diff --git a/tests/python/unittest/test_optimizer.py b/tests/python/unittest/test_optimizer.py index ae2a29fddc38..ac2a697d40d2 100755 --- a/tests/python/unittest/test_optimizer.py +++ b/tests/python/unittest/test_optimizer.py @@ -35,15 +35,15 @@ def compare_optimizer(opt1, opt2, shape, w_stype='default', g_stype='default'): w2 = mx.random.uniform(shape=shape, ctx=default_context()) w1 = w2.copyto(default_context()) elif w_stype == 'row_sparse': - w2 = random_sparse_ndarray(shape, w_stype, allow_zeros = False) - w1 = random_sparse_ndarray(shape, w_stype, allow_zeros = False).to_dense() + w2 = rand_ndarray(shape, w_stype) + w1 = rand_ndarray(shape, w_stype).to_dense() else: raise Exception("type not supported yet") if g_stype == 'default': g2 = mx.random.uniform(shape=shape, ctx=default_context()) g1 = g2.copyto(default_context()) elif g_stype == 'row_sparse': - g2 = random_sparse_ndarray(shape, g_stype, allow_zeros = False) + g2 = rand_ndarray(shape, g_stype) g1 = g2.copyto(default_context()).to_dense() else: raise Exception("type not supported yet") diff --git a/tests/python/unittest/test_sparse_ndarray.py b/tests/python/unittest/test_sparse_ndarray.py index c089e002c242..e47c2b2a75a1 100644 --- a/tests/python/unittest/test_sparse_ndarray.py +++ b/tests/python/unittest/test_sparse_ndarray.py @@ -1,17 +1,17 @@ import os -import random import mxnet as mx import numpy as np import pickle as pkl from mxnet.test_utils import * from numpy.testing import assert_allclose +import numpy.random as rnd def check_sparse_nd_elemwise_binary(shapes, storage_types, f, g): # generate inputs nds = [] for i, storage_type in enumerate(storage_types): if storage_type == 'row_sparse': - nd = random_sparse_ndarray(shapes[i], storage_type, allow_zeros = False) + nd, _ = rand_sparse_ndarray(shapes[i], storage_type) elif storage_type == 'default': nd = mx.nd.array(random_arrays(shapes[i]), dtype = np.float32) else: @@ -26,7 +26,7 @@ def test_sparse_nd_elemwise_add(): g = lambda x,y: x + y op = mx.nd.elemwise_add for i in xrange(num_repeats): - shape = [(random.randint(1, 10),random.randint(1, 10))] * 2 + shape = [(rnd.randint(1, 10),rnd.randint(1, 10))] * 2 check_sparse_nd_elemwise_binary(shape, ['default'] * 2, op, g) check_sparse_nd_elemwise_binary(shape, ['default', 'row_sparse'], op, g) check_sparse_nd_elemwise_binary(shape, ['row_sparse', 'row_sparse'], op, g) @@ -37,48 +37,112 @@ def test_sparse_nd_elementwise_fallback(): g = lambda x,y: x + y op = mx.nd.add_n for i in xrange(num_repeats): - shape = [(random.randint(1, 10),random.randint(1, 10))] * 2 + shape = [(rnd.randint(1, 10), rnd.randint(1, 10))] * 2 check_sparse_nd_elemwise_binary(shape, ['default'] * 2, op, g) check_sparse_nd_elemwise_binary(shape, ['default', 'row_sparse'], op, g) check_sparse_nd_elemwise_binary(shape, ['row_sparse', 'row_sparse'], op, g) -def check_conversion_row_sparse(): - val = np.array([5, 10]) - idx = np.array([1]) - sparse_val = np.array([[0, 0], [5, 10], [0, 0], [0, 0], [0, 0]]) - a = 
mx.nd.array(val) - b = mx.nd.array(idx, dtype=np.int32) - d = mx.sparse_nd.array(a, [b], 'row_sparse', (5,2)) - f = mx.sparse_nd.to_dense(d) - assert_almost_equal(f.asnumpy(), sparse_val) - -def test_sparse_nd_conversion(): - check_conversion_row_sparse() - def test_sparse_nd_zeros(): - zero = mx.nd.zeros((2,2)) - sparse_zero = mx.sparse_nd.zeros((2,2), 'row_sparse') - assert_almost_equal(sparse_zero.asnumpy(), zero.asnumpy()) + def check_sparse_nd_zeros(shape, stype): + zero = mx.nd.zeros(shape) + sparse_zero = mx.sparse_nd.zeros('row_sparse', shape) + assert_almost_equal(sparse_zero.asnumpy(), zero.asnumpy()) + + shape = (rnd.randint(1, 10), rnd.randint(1, 10)) + check_sparse_nd_zeros(shape, 'row_sparse') + check_sparse_nd_zeros(shape, 'csr') -def check_sparse_nd_copy(storage_type): - c = random_sparse_ndarray((10, 10), storage_type, allow_zeros = True) - d = c.copyto(mx.Context('cpu', 0)) - assert np.sum(np.abs(c.asnumpy() != d.asnumpy())) == 0.0 def test_sparse_nd_copy(): - check_sparse_nd_copy('row_sparse') + def check_sparse_nd_copy(from_stype, to_stype): + shape = (rnd.randint(1, 10), rnd.randint(1, 10)) + from_nd = rand_ndarray(shape, from_stype) + # copy to ctx + to_ctx = from_nd.copyto(default_context()) + # copy to stype + to_nd = rand_ndarray(shape, to_stype) + to_nd = from_nd.copyto(to_nd) + assert np.sum(np.abs(from_nd.asnumpy() != to_ctx.asnumpy())) == 0.0 + assert np.sum(np.abs(from_nd.asnumpy() != to_nd.asnumpy())) == 0.0 -def test_sparse_nd_property(): + check_sparse_nd_copy('row_sparse', 'row_sparse') + check_sparse_nd_copy('row_sparse', 'default') + check_sparse_nd_copy('default', 'row_sparse') + +def check_sparse_nd_prop_rsp(): storage_type = 'row_sparse' - a = random_sparse_ndarray((10, 10), storage_type, allow_zeros = True) - assert(a.num_aux == 1) - assert(a.aux_type(0) == np.int32) - assert(a.storage_type == 'row_sparse') + shape = (rnd.randint(1, 2), rnd.randint(1, 2)) + nd, (v, idx) = rand_sparse_ndarray(shape, storage_type) + assert(nd._num_aux == 1) + assert(nd.indices.dtype == np.int32) + assert(nd.storage_type == 'row_sparse') + assert_almost_equal(nd._data().asnumpy(), v) + assert_almost_equal(nd._aux_data(0).asnumpy(), idx) + +def test_sparse_nd_basic(): + def check_rsp_creation(values, indices, shape): + rsp = mx.sparse_nd.row_sparse(values, indices, shape) + dns = mx.nd.zeros(shape) + dns[1] = mx.nd.array(values[0]) + dns[3] = mx.nd.array(values[1]) + assert_almost_equal(rsp.asnumpy(), dns.asnumpy()) + indices = mx.nd.array(indices).asnumpy() + assert_almost_equal(rsp.indices.asnumpy(), indices) + + def check_csr_creation(shape): + csr, (indptr, indices, values) = rand_sparse_ndarray(shape, 'csr') + assert_almost_equal(csr.indptr.asnumpy(), indptr) + assert_almost_equal(csr.indices.asnumpy(), indices) + assert_almost_equal(csr.values.asnumpy(), values) + + shape = (4,2) + values = np.random.rand(2,2) + indices = np.array([1,3]) + check_rsp_creation(values, indices, shape) + + values = mx.nd.array(np.random.rand(2,2)) + indices = mx.nd.array([1,3], dtype='int32') + check_rsp_creation(values, indices, shape) + + values = [[0.1, 0.2], [0.3, 0.4]] + indices = [1,3] + check_rsp_creation(values, indices, shape) + + check_csr_creation(shape) + check_sparse_nd_prop_rsp() + + +def test_sparse_nd_setitem(): + shape = (3, 4) + # ndarray assignment + x = mx.sparse_nd.zeros('row_sparse', shape) + x[:] = mx.nd.ones(shape) + x_np = np.ones(shape, dtype=x.dtype) + assert same(x.asnumpy(), x_np) + + # numpy assignment + x = mx.sparse_nd.zeros('row_sparse', shape) + x[:] = 
np.ones(shape) + x_np = np.ones(shape, dtype=x.dtype) + assert same(x.asnumpy(), x_np) + +def test_sparse_nd_slice(): + def check_sparse_nd_csr_slice(shape): + storage_type = 'csr' + A, _ = rand_sparse_ndarray(shape, storage_type) + A2 = A.asnumpy() + start = rnd.randint(0, shape[0] - 1) + end = rnd.randint(start + 1, shape[0]) + assert same(A[start:end].asnumpy(), A2[start:end]) + + shape = (rnd.randint(2, 10), rnd.randint(1, 10)) + check_sparse_nd_csr_slice(shape) if __name__ == '__main__': - test_sparse_nd_conversion() test_sparse_nd_zeros() test_sparse_nd_elementwise_fallback() - test_sparse_nd_elemwise_add() test_sparse_nd_copy() - test_sparse_nd_property() + test_sparse_nd_elemwise_add() + test_sparse_nd_setitem() + test_sparse_nd_basic() + test_sparse_nd_slice() diff --git a/tests/python/unittest/test_sparse_operator.py b/tests/python/unittest/test_sparse_operator.py index 6e7c97ca7f43..9ae86b25f94a 100644 --- a/tests/python/unittest/test_sparse_operator.py +++ b/tests/python/unittest/test_sparse_operator.py @@ -1,86 +1,48 @@ # pylint: skip-file import numpy as np import mxnet as mx +import scipy as sp from numpy.testing import assert_allclose from mxnet.test_utils import * - -def test_elemwise_add_dense(): - data1 = mx.symbol.Variable('data1') - data2 = mx.symbol.Variable('data2') - shape = (1, 1) - data1_tmp = np.ones(shape) - data2_tmp = np.zeros(shape) + 2 - test = mx.symbol.elemwise_add(data1, data2) - # check_numeric_gradient(test, [data_tmp]) - check_symbolic_forward(test, {'data1':data1_tmp, - 'data2':data2_tmp}, [data1_tmp + data2_tmp]) - #check_symbolic_backward(test, [data_tmp], [np.ones(shape)], [2 * data_tmp]) - arr_grad1 = mx.nd.empty(shape) - arr_grad2 = mx.nd.empty(shape) - # init grad arrays before bind - exec_test = test.bind(default_context(), args={'data1':mx.nd.array(data1_tmp), 'data2':mx.nd.array(data2_tmp)}, - args_grad=[arr_grad1, arr_grad2]) - exec_test.forward(is_train=True) - assert_almost_equal(exec_test.outputs[0].asnumpy(), data1_tmp + data2_tmp) - exec_test.backward(out_grads = exec_test.outputs) - assert_almost_equal(arr_grad1.asnumpy(), arr_grad2.asnumpy()) - - -def test_elemwise_add_dense_sparse(): - # prep data - dense_np = np.array([[1,2],[3,4],[5,6]]) - sparse_np1 = np.array([[5,10],[0,0],[0,0]]) - dense_nd = mx.nd.array(dense_np) - - val = mx.nd.array([5, 10]); - idx = mx.nd.array([0], dtype=np.int32); - sparse_nd1 = mx.sparse_nd.row_sparse(val, idx, (3,2)) - - data1 = mx.symbol.Variable('data1') - data2 = mx.symbol.Variable('data2', storage_type='row_sparse') - test = mx.symbol.elemwise_add(data1, data2, name='plus') - check_symbolic_forward(test, {'data1':dense_nd, - 'data2':sparse_nd1}, [dense_np + sparse_np1]) - - -def test_elemwise_add_sparse_sparse(): - # prep data - shape = (4, 2) - sparse_np1 = np.array([[5,10],[0,0],[0,0],[0,0]]) - sparse_np2 = np.array([[0,0],[5,10],[0,0],[0,0]]) - - val1 = mx.nd.array([5, 10]) - val2 = mx.nd.array([5, 10]) - idx1 = mx.nd.array([0], dtype=np.int32); - idx2 = mx.nd.array([1], dtype=np.int32); - sparse_nd1 = mx.sparse_nd.row_sparse(val1, idx1, shape) - sparse_nd2 = mx.sparse_nd.row_sparse(val2, idx2, shape) - - data1 = mx.symbol.Variable('data1', storage_type='row_sparse') - data2 = mx.symbol.Variable('data2', storage_type='row_sparse') - test = mx.symbol.elemwise_add(data1, data2, name='plus') - check_symbolic_forward(test, {'data1':sparse_nd1, - 'data2':sparse_nd2}, [sparse_np1 + sparse_np2]) - arr_grad1 = mx.sparse_nd.zeros(shape, 'row_sparse') - arr_grad2 = mx.sparse_nd.zeros(shape, 'row_sparse') - 
exec_test = test.bind(default_context(), args={'data1':sparse_nd1, 'data2':sparse_nd2}, - args_grad=[arr_grad1, arr_grad2]) - exec_test.forward(is_train=True) - assert_almost_equal(exec_test.outputs[0].asnumpy(), sparse_np1 + sparse_np2) - exec_test.backward(out_grads = exec_test.outputs) - assert_almost_equal(arr_grad1.asnumpy(), arr_grad2.asnumpy()) - - -def test_elemwise_add_multiple_stages(): +def check_elemwise_add_ex(lhs_stype, rhs_stype, shape, lhs_grad_stype=None, rhs_grad_stype=None): + lhs = mx.symbol.Variable('lhs', storage_type = lhs_stype) + rhs = mx.symbol.Variable('rhs', storage_type = rhs_stype) + if lhs_grad_stype is not None: + lhs._set_attr(grad_stype_hint=str(lhs_grad_stype)) + if rhs_grad_stype is not None: + rhs._set_attr(grad_stype_hint=str(rhs_grad_stype)) + + lhs_nd = rand_ndarray(shape, lhs_stype) + rhs_nd = rand_ndarray(shape, rhs_stype) + lhs_np = lhs_nd.asnumpy() + rhs_np = rhs_nd.asnumpy() + + out_np = lhs_np + rhs_np + test = mx.symbol.elemwise_add(lhs, rhs) + location = {'lhs':lhs_nd, 'rhs':rhs_nd} + check_symbolic_forward(test, location, [out_np]) + check_numeric_gradient(test, location) + check_symbolic_backward(test, location, [out_np], [out_np, out_np]) + +def test_elemwise_add_ex(): + shape = (rnd.randint(1, 10),rnd.randint(1, 10)) + check_elemwise_add_ex('default', 'default', shape) + check_elemwise_add_ex('default', 'row_sparse', shape) + check_elemwise_add_ex('row_sparse', 'default', shape) + check_elemwise_add_ex('row_sparse', 'row_sparse', shape, + lhs_grad_stype='row_sparse', rhs_grad_stype='row_sparse') + +# TODO(haibin) randomize this test +def test_elemwise_add_ex_multiple_stages(): # prep data shape = (4, 2) ds_np = np.array([[1,2],[3,4],[5,6],[7,8]]) sp_np1 = np.array([[5,10],[0,0],[0,0],[0,0]]) sp_np2 = np.array([[0,0],[5,10],[0,0],[0,0]]) - val1 = mx.nd.array([5, 10]); - val2 = mx.nd.array([5, 10]); + val1 = mx.nd.array([[5, 10]]); + val2 = mx.nd.array([[5, 10]]); idx1 = mx.nd.array([0], dtype=np.int32); idx2 = mx.nd.array([1], dtype=np.int32); sp_nd1 = mx.sparse_nd.row_sparse(val1, idx1, shape) @@ -105,37 +67,28 @@ def test_elemwise_add_multiple_stages(): exec_test.backward(out_grads = exec_test.outputs) assert_almost_equal(arr_grads[0].asnumpy(), arr_grads[1].asnumpy()) - -def test_cast_storage(): - def test_rsp_to_dns(data, row_idx, shape): - rsp = mx.sparse_nd.array(values=data, index_list=[row_idx], storage_type='row_sparse', shape=shape) +# TODO(haibin) also add test for backward pass +def test_cast_storage_ex(): + def test_rsp_to_dns(shape): + rsp, (data, row_idx) = rand_sparse_ndarray(shape, 'row_sparse') dns_out = mx.nd.cast_storage(rsp, storage_type='default') dns_expected = np.zeros(shape, dtype=default_dtype()) - for k, v in enumerate(row_idx): - dns_expected[v, :] = data[k] + if row_idx is not None: + for k, v in enumerate(row_idx): + dns_expected[v, :] = data[k] assert same(dns_out.asnumpy(), dns_expected) - def test_dns_to_rsp(dns_in): - dns_in = np.array(dns_in) + def test_dns_to_rsp(shape): + dns_in = rand_ndarray(shape, 'default') rsp_out = mx.nd.cast_storage(mx.nd.array(dns_in, dtype=default_dtype()), storage_type='row_sparse') ret = mx.nd.cast_storage(rsp_out, storage_type='default') - assert same(ret.asnumpy(), dns_in) + assert same(ret.asnumpy(), dns_in.asnumpy()) - def test_csr_to_dns(data, indptr, col_idx, shape): - indptr = np.array(indptr, dtype=np.int32) - col_idx = np.array(col_idx, dtype=np.int32) - csr = mx.sparse_nd.array(values=data, index_list=[col_idx, indptr], storage_type='csr', shape=shape, - 
aux_types=[np.int32, np.int32]) - dns_out = mx.nd.cast_storage(csr, storage_type='default') - dns_expected = np.zeros(shape, dtype=default_dtype()) - i = 0 - while i < len(indptr) - 1: - j = indptr[i] - while j < indptr[i+1]: - dns_expected[i, col_idx[j]] = data[j] - j = j + 1 - i = i + 1 - assert same(dns_out.asnumpy(), dns_expected) + def test_csr_to_dns(shape): + csr, (indptr, indices, values) = rand_sparse_ndarray(shape, 'csr') + mx_dns = csr.to_dense() + np_dns = sp.sparse.csr_matrix((values, indices, indptr), shape).todense() + assert_almost_equal(mx_dns.asnumpy(), np_dns) def test_dns_to_csr(dns_in): dns_in= np.array(dns_in) @@ -143,11 +96,10 @@ def test_dns_to_csr(dns_in): ret = mx.nd.cast_storage(csr_out, storage_type='default') assert same(ret.asnumpy(), dns_in) - test_rsp_to_dns([], [], (10, 3)) - test_rsp_to_dns([[1, 2], [3, 4], [5, 6], [7, 8]], [2, 4, 5, 7], (10, 2)) - test_dns_to_rsp([[0, 1, 0], [0, 2, 0], [3, 0, 0], [0, 0, 4], [5, 6, 0], [0, 0, 7]]) - test_csr_to_dns([], [0, 0, 0, 0, 0], [], (4, 4)) - test_csr_to_dns([5, 8, 3, 6], [0, 0, 2, 3, 4], [0, 1, 2, 1], (4, 4)) + shape = (rnd.randint(1, 10),rnd.randint(1, 10)) + test_rsp_to_dns(shape) + test_dns_to_rsp(shape) + test_csr_to_dns((4, 4)) test_dns_to_csr([[0, 1, 0], [0, 2, 0], [3, 0, 0], [0, 0, 4], [5, 6, 0], [0, 0, 7]]) # TODO(junwu): The backward of the operator dot cannot be tested for now @@ -157,38 +109,75 @@ def test_dns_to_csr(dns_in): # the same impl function of dot(csr, dns) = rsp and it has been tested # in the forward test cases as the following. def test_sparse_dot(): - def test_dot_csr_dns_rsp(dns1, dns2, trans_csr): - dns1 = mx.nd.array(dns1) - dns2 = mx.nd.array(dns2) + def test_dot_csr_dns_rsp(csr_shape, dns_shape, dns_grad_stype, trans_csr): + dns1 = rand_ndarray(csr_shape, 'default') + dns2 = rand_ndarray(dns_shape, 'default') csr = mx.nd.cast_storage(dns1, storage_type='csr') rsp_out = mx.nd.dot(csr, dns2, transpose_a=trans_csr) - rsp_expected = mx.nd.dot(csr.to_dense(), dns2, transpose_a=trans_csr) + rsp_expected = mx.nd.dot(dns1, dns2, transpose_a=trans_csr) + out_np = rsp_expected.asnumpy() + backward_trans = not trans_csr + rhs_backward_grad = mx.nd.dot(dns1, rsp_expected, transpose_a=backward_trans).asnumpy() # TODO(junwu): may need to compare rsp_out and rsp_expected in rsp format # instead of converting them to the dense format - assert same(rsp_out.asnumpy(), rsp_expected.asnumpy()) + assert same(rsp_out.asnumpy(), out_np) # test symbolic forward lhs = mx.symbol.Variable('lhs', storage_type='csr') rhs = mx.symbol.Variable('rhs', storage_type='default') - sym_dot = mx.symbol.dot(lhs, rhs, transpose_a=trans_csr) - dns2_grad = mx.sparse_nd.zeros(dns2.shape, 'row_sparse') - exec_dot = sym_dot.bind(default_context(), args={'lhs': csr, 'rhs': dns2}, args_grad={'rhs': dns2_grad}, - grad_req={'lhs': 'null', 'rhs': 'write'}) - exec_dot.forward(is_train=True) - assert same(exec_dot.outputs[0].asnumpy(), rsp_expected.asnumpy()) - - test_dot_csr_dns_rsp(dns1=[[0, 0, 1, 4], [2, 0, 0, 0], [0, 0, 0, 0], [2, 9, 0, 5], [0, 0, 0, 1]], - dns2=[[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]], - trans_csr=False) - test_dot_csr_dns_rsp(dns1=[[0, 0, 1, 4], [2, 0, 0, 0], [0, 0, 0, 0], [2, 9, 0, 5], [0, 0, 0, 1]], - dns2=[[1, 2, 3, 4, 5], [5, 6, 7, 8, 6], [9, 10, 11, 12, 6], [13, 14, 15, 16, 7], - [1, 1, 1, 1, 2]], trans_csr=True) + rhs._set_attr(grad_stype_hint=str(dns_grad_stype)) + # TODO(haibin) since backward op is not fully implemented, here we add a dense zero ndarray + # so that the output 
gradient is dense. + zeros = mx.symbol.Variable('zero', storage_type='default') + sym_dot = mx.symbol.dot(lhs, rhs, transpose_a=trans_csr) + test = mx.symbol.elemwise_add(sym_dot, zeros) + location = {'lhs':csr, 'rhs':dns2, 'zero':mx.nd.zeros(rsp_expected.shape)} + expected = {'rhs':rhs_backward_grad, 'zero':out_np} + # dot(lhs, rhs) + zeros + check_symbolic_forward(test, location, [rsp_expected.asnumpy()]) + check_symbolic_backward(test, location, [out_np], expected, + grad_req={'lhs': 'null', 'rhs': 'write', 'zero' : 'write'}) + + lhs_shape = (rnd.randint(1, 10),rnd.randint(1, 10)) + test_dot_csr_dns_rsp(lhs_shape, (lhs_shape[1], rnd.randint(1, 10)), 'row_sparse', False) + test_dot_csr_dns_rsp(lhs_shape, (lhs_shape[0], rnd.randint(1, 10)), 'row_sparse', True) + test_dot_csr_dns_rsp(lhs_shape, (lhs_shape[1], rnd.randint(1, 10)), 'default', False) + test_dot_csr_dns_rsp(lhs_shape, (lhs_shape[0], rnd.randint(1, 10)), 'default', True) + +''' +def test_sparse_embedding(): + in_dim = 10 + out_dim = 4 + batch = 24 + + data = mx.sym.Variable("data", dtype=np.int32) + embed = mx.sym.SparseEmbedding(data=data, input_dim=in_dim, output_dim=out_dim, name="embed") + # TODO(haibin) test again when simple_bind cpp api is ready + exe_test = embed.simple_bind(default_context(), grad_req={'data': 'null', 'embed_weight': 'write'}, + data=(batch,)) + arg_map = dict(zip(embed.list_arguments(), exe_test.arg_arrays)) + grad_map = dict(zip(embed.list_arguments(), exe_test.grad_arrays)) + np_data = np.random.randint(low=0, high=in_dim, size=batch) + np_weight = np.random.uniform(-0.01, 0.01, arg_map["embed_weight"].shape) + np_onehot = np.zeros((batch, in_dim)) + np_onehot[np.arange(batch), np_data] = 1.0 + # forward + arg_map["data"][:] = np_data + arg_map["embed_weight"][:] = np_weight + exe_test.forward(is_train=True) + assert_almost_equal(exe_test.outputs[0].asnumpy(), np.dot(np_onehot, np_weight)) + # backward + np_grad = np.random.uniform(-1, 1, exe_test.outputs[0].shape) + grad = mx.nd.zeros(np_grad.shape) + grad[:] = np_grad + exe_test.backward([grad]) + assert_almost_equal(grad_map["embed_weight"].asnumpy(), np.dot(np_onehot.T, np_grad)) +''' if __name__ == '__main__': - test_elemwise_add_dense() - test_elemwise_add_dense_sparse() - test_elemwise_add_sparse_sparse() - test_elemwise_add_multiple_stages() - test_cast_storage() + test_elemwise_add_ex() + test_elemwise_add_ex_multiple_stages() + test_cast_storage_ex() test_sparse_dot() + #test_sparse_embedding() diff --git a/tests/python/unittest/test_symbol.py b/tests/python/unittest/test_symbol.py index 414c1a1ddfe2..ab25f48eeb52 100644 --- a/tests/python/unittest/test_symbol.py +++ b/tests/python/unittest/test_symbol.py @@ -233,7 +233,6 @@ def test_zero_prop2(): test_symbol_infer_shape_var() test_symbol_infer_shape() test_symbol_infer_type() - #TODO test infer storage type test_symbol_internal() test_symbol_basic() test_symbol_compose()
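
As a quick, minimal sketch of the front end these tests exercise (an assumption-laden illustration, not part of the patch: it presumes a build of this branch, since the mx.sparse_nd module, the storage_type keyword, and the cast_storage operator only exist with this change applied), the row_sparse round trip looks roughly like:

import mxnet as mx
import numpy as np

# One stored row (index 0) of a 3x2 row_sparse array; values are 2-D,
# with one row of values per stored index, as in the tests above.
val = mx.nd.array([[5, 10]])
idx = mx.nd.array([0], dtype=np.int32)
rsp = mx.sparse_nd.row_sparse(val, idx, (3, 2))

# Convert back to default (dense) storage two ways and compare.
dns = mx.nd.cast_storage(rsp, storage_type='default')
assert np.array_equal(dns.asnumpy(), rsp.to_dense().asnumpy())

The names mirror the calls added in tests/python/unittest/test_sparse_ndarray.py and test_sparse_operator.py; they are illustrative only.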