diff --git a/Jenkinsfile b/Jenkinsfile index e01621dc5374..9abe9c0365bd 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -205,7 +205,7 @@ del /Q *.7z def python_ut(docker_type) { timeout(time: max_time, unit: 'MINUTES') { sh "${docker_run} ${docker_type} PYTHONPATH=./python/ nosetests-2.7 --with-timer --verbose tests/python/unittest" - // sh "${docker_run} ${docker_type} PYTHONPATH=./python/ nosetests-3.4 --with-timer --verbose tests/python/unittest" + sh "${docker_run} ${docker_type} PYTHONPATH=./python/ nosetests-3.4 --with-timer --verbose tests/python/unittest" sh "${docker_run} ${docker_type} PYTHONPATH=./python/ nosetests-2.7 --with-timer --verbose tests/python/train" } } @@ -215,7 +215,7 @@ def python_ut(docker_type) { def python_gpu_ut(docker_type) { timeout(time: max_time, unit: 'MINUTES') { sh "${docker_run} ${docker_type} PYTHONPATH=./python/ nosetests-2.7 --with-timer --verbose tests/python/gpu" - // sh "${docker_run} ${docker_type} PYTHONPATH=./python/ nosetests-3.4 --with-timer --verbose tests/python/gpu" + sh "${docker_run} ${docker_type} PYTHONPATH=./python/ nosetests-3.4 --with-timer --verbose tests/python/gpu" } } diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index 12e2a270b02e..c4304b172985 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -222,21 +222,6 @@ MXNET_DLL int MXNDArrayCreate(const mx_uint *shape, int delay_alloc, NDArrayHandle *out); -/*! - * \brief create a NDArray with specified sparse type, shape and aux data(e.g. index) - * aux data is copied during construction. - */ -MXNET_DLL int MXNDArrayCreateSparse(NDArrayHandle data, - mx_uint num_aux, - NDArrayHandle *aux_data, - const mx_uint *shape, - mx_uint ndim, - int storage_type, - int dev_type, - int dev_id, - int delay_alloc, - int dtype, - NDArrayHandle *out); /*! * \brief create a NDArray with specified shape and data type * \param shape the pointer to the shape @@ -260,6 +245,20 @@ MXNET_DLL int MXNDArrayCreateEx(const mx_uint *shape, /*! * \brief create an empty sparse NDArray with specified shape and data type + * \param storage_type the storage type of the ndarray + * \param shape the pointer to the shape + * \param ndim the dimension of the shape + * \param dev_type device type, specify device we want to take + * \param dev_id the device id of the specific device + * \param delay_alloc whether to delay allocation until + * the narray is first mutated + * \param dtype data type of created array + * \param num_aux the number of aux data to support this ndarray + * \param aux_type data type of the aux data for the created array + * \param aux_ndims the dimension of the shapes of aux data + * \param aux_shape the shapes of aux data + * \param out the returning handle + * \return 0 when success, -1 when failure happens */ MXNET_DLL int MXNDArrayCreateSparseEx(int storage_type, const mx_uint *shape, @@ -269,7 +268,9 @@ MXNET_DLL int MXNDArrayCreateSparseEx(int storage_type, int delay_alloc, int dtype, mx_uint num_aux, - int *aux_types, + int *aux_type, + mx_uint *aux_ndims, + const mx_uint *aux_shape, NDArrayHandle *out); /*! @@ -439,13 +440,26 @@ MXNET_DLL int MXNDArrayGetData(NDArrayHandle handle, */ MXNET_DLL int MXNDArrayGetDType(NDArrayHandle handle, int *out_dtype); -// Get the aux type for ith aux data + +/*! 
+ * \brief get the type of the ith aux data in NDArray + * \param handle the handle to the narray + * \param i the index of the aux data + * \param out_type pointer holder to get type of aux data + * \return 0 when success, -1 when failure happens + */ MXNET_DLL int MXNDArrayGetAuxType(NDArrayHandle handle, mx_uint i, - int *out_aux_type); -// Get the num of aux data to help store sparse NDArray -MXNET_DLL int MXNDArrayGetNumAux(NDArrayHandle handle, - mx_uint *out_num_aux); + int *out_type); + +// Get the ith aux data blob wrapped in an NDArray +MXNET_DLL int MXNDArrayGetAuxNDArray(NDArrayHandle handle, + mx_uint i, + NDArrayHandle *out); + +// Get the data blob wrapped in an NDArray +MXNET_DLL int MXNDArrayGetDataNDArray(NDArrayHandle handle, + NDArrayHandle *out); /*! * \brief get the context of the NDArray * \param handle the handle to the narray diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h index 86a6afc4d8e9..71fc78527707 100644 --- a/include/mxnet/ndarray.h +++ b/include/mxnet/ndarray.h @@ -15,6 +15,7 @@ #include #include #include +#include #include #include "./base.h" #include "./storage.h" @@ -28,8 +29,22 @@ #endif namespace mxnet { -// forward declaration +// forward declarations class NDArray; + +namespace op { +template +void FillZerosRspImpl(mshadow::Stream *s, NDArray *dst); + +template +void CastStorageComputeImpl(mshadow::Stream *s, const NDArray& input, const NDArray& output); +}; + +namespace ndarray { +template +void Copy(const TBlob &from, TBlob *to, Context from_ctx, Context to_ctx, RunContext ctx); +}; + namespace autograd { class AGNode; @@ -72,19 +87,6 @@ enum NDArrayStorageType { kCSRStorage, // csr }; -/*! - * \brief issue an copy operation from one NDArray to another - * the two ndarray can sit on different devices - * this operation will be scheduled by the engine - * - * \param from the ndarray we want to copy data from - * \param to the target ndarray - * \param priority Priority of the action. - * \param alloc_output whether to allocate memory for the output ndarray - * \note The function name explicitly marks the order of from and to - * due to different possible convention carried by copy function. - */ -void CopyFromTo(const NDArray &from, NDArray *to, int priority = 0, bool alloc_output = true); /*! 
* \brief ndarray interface @@ -116,7 +118,8 @@ class NDArray { */ NDArray(const NDArrayStorageType storage_type, const TShape &shape, Context ctx, bool delay_alloc = true, int dtype = mshadow::default_type_flag, - std::vector aux_types = {}) + std::vector aux_types = {}, std::vector aux_shapes = {}, + TShape storage_shape = TShape({0})) : shape_(shape), offset_(0), dtype_(dtype), entry_({nullptr, 0, 0}) { // Assign default aux types if not given if (aux_types.size() == 0) { @@ -128,7 +131,30 @@ class NDArray { LOG(FATAL) << "Unknown storage type"; } } - ptr_ = std::make_shared(ctx, delay_alloc, aux_types, storage_type); + // Assign default shapes if not given + // unknown shapes are intialized as {0} such that Size() would return 0 + if (aux_shapes.size() == 0) { + if (storage_type == kRowSparseStorage) { + aux_shapes = {TShape({0})}; + } else if (storage_type == kCSRStorage) { + // aux shapes for indptr and indices + aux_shapes = {TShape({0}), TShape({0})}; + } else { + LOG(FATAL) << "Unknown storage type"; + } + } + if (storage_shape.Size() == 0) { + if (storage_type == kRowSparseStorage) { + storage_shape = shape; + storage_shape[0] = aux_shapes[rowsparse::kIdx][0]; + } else if (storage_type == kCSRStorage) { + storage_shape = aux_shapes[csr::kIdx]; + } else { + LOG(FATAL) << "Unknown storage type"; + } + } + ptr_ = std::make_shared(storage_type, storage_shape, ctx, delay_alloc, + dtype, aux_types, aux_shapes); #if MKL_EXPERIMENTAL == 1 Mkl_mem_ = std::make_shared(); #endif @@ -149,14 +175,6 @@ class NDArray { Mkl_mem_ = std::make_shared(); #endif } - NDArray(NDArray data, const std::vector aux_data, Context ctx, - NDArrayStorageType storage_type, const TShape &shape) - : ptr_(std::make_shared(data, aux_data, ctx, storage_type)), shape_(shape), - offset_(0), dtype_(data.data().type_flag_), entry_({nullptr, 0, 0}) { -#if MKL_EXPERIMENTAL == 1 - Mkl_mem_ = std::make_shared(); -#endif - } /*! * \return the shape of current NDArray. 
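The sparse constructor above only fills in default aux types, aux shapes and the storage shape when they are not supplied; from the Python front-end added later in this patch, the same path is exercised by mx.sparse_nd.zeros and mx.sparse_nd.csr. A minimal usage sketch (shapes and values are illustrative only):

    import mxnet as mx
    import numpy as np

    # empty row_sparse array: aux types/shapes fall back to the per-storage-type defaults
    a = mx.sparse_nd.zeros('row_sparse', (2, 2))
    print(a.asnumpy())          # materialized as all zeros via to_dense()

    # csr array built from explicit values/indptr/indices
    indptr  = np.array([0, 2, 3, 6])
    indices = np.array([0, 2, 2, 0, 1, 2])
    data    = np.array([1, 2, 3, 4, 5, 6])
    b = mx.sparse_nd.csr(data, indptr, indices, (3, 3))
    print(b.aux_types)          # index dtypes chosen from the storage-type defaults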
@@ -195,7 +213,6 @@ class NDArray { // TODO(haibin) CamelCase inline const TShape aux_shape(size_t i) const { CHECK(storage_type() != kDefaultStorage); - if (i >= ptr_->aux_shapes.size()) return TShape(); return ptr_->aux_shapes[i]; } @@ -217,14 +234,19 @@ class NDArray { CHECK(ptr_ != nullptr); TBlob res; TShape shape = shape_; - if (storage_type() != kDefaultStorage) { - CHECK(offset_ == 0) << "Non-default storage should never set offset_"; - shape = storage_shape(); - } + auto stype = storage_type(); MSHADOW_TYPE_SWITCH(dtype(), DType, { - CHECK(ptr_->shandle.dptr != nullptr); - res = TBlob(static_cast(ptr_->shandle.dptr) - + offset_, shape, ptr_->shandle.ctx.dev_mask(), dtype()); + auto dptr = static_cast(ptr_->shandle.dptr); + if (stype == kDefaultStorage) { + dptr += offset_; + } else if (stype == kCSRStorage) { + shape = storage_shape(); + } else if (stype == kRowSparseStorage) { + shape = storage_shape(); + } else { + LOG(FATAL) << "unknown storage type " << stype; + } + res = TBlob(dptr, shape, ptr_->shandle.ctx.dev_mask(), dtype()); }); #if MKL_EXPERIMENTAL == 1 res.Mkl_mem_ = Mkl_mem_; @@ -235,12 +257,23 @@ class NDArray { * \return the aux TBlob */ inline TBlob aux_data(size_t i) const { - CHECK(storage_type() != kDefaultStorage); + auto stype = storage_type(); TBlob res; - CHECK(i < ptr_->aux_handles.size()); - MSHADOW_TYPE_SWITCH(aux_type(i), DType, { - res = TBlob(static_cast(ptr_->aux_handles[i].dptr), aux_shape(i), - ptr_->aux_handles[i].ctx.dev_mask(), aux_type(i)); + auto shape = aux_shape(i); + auto type = aux_type(i); + MSHADOW_TYPE_SWITCH(type, DType, { + auto dptr = static_cast(ptr_->aux_handles[i].dptr); + if (stype == kRowSparseStorage) { + CHECK_EQ(offset_, 0); + } else if (stype == kCSRStorage) { + if (i == csr::kIndPtr) { + dptr += offset_; + shape[0] = shape_[0] + 1; + } + } else { + LOG(FATAL) << "Unexpected storage type"; + } + res = TBlob(dptr, shape, ptr_->aux_handles[i].ctx.dev_mask(), type); }); #if MKL_EXPERIMENTAL == 1 res.Mkl_mem_ = Mkl_mem_; @@ -284,14 +317,22 @@ class NDArray { if (is_none()) return kUndefinedStorage; return ptr_->storage_type; } - inline size_t num_aux() const { - if (is_none()) return 0; - return ptr_->aux_handles.size(); - } /*! \return whether this ndarray is not initialized */ inline bool is_none() const { return ptr_.get() == nullptr; } + // returns true if a sparse ndarray's aux_data and storage are initialized + inline bool storage_initialized() const { + if (is_none()) return false; + auto stype = storage_type(); + CHECK_NE(stype, kDefaultStorage); + if (stype == kRowSparseStorage || stype == kCSRStorage) { + return aux_shape(0).Size() != 0; + } else { + LOG(FATAL) << "Unknown storage type"; + } + return true; + } /*! * \brief Block until all the pending write operations with respect * to current NDArray are finished, and read can be performed. @@ -425,19 +466,28 @@ class NDArray { void SyncCopyToCPU(void *data, size_t size) const; /*! 
* \brief Slice a NDArray - * \param begin begin index in first dim - * \param end end index in first dim + * \param begin begin index in first dim (inclusive) + * \param end end index in first dim (exclusive) * \return sliced NDArray */ inline NDArray Slice(index_t begin, index_t end) const { NDArray ret = *this; CHECK(!is_none()) << "NDArray is not initialized"; CHECK_GE(shape_[0], end) << "Slice end index out of range"; - CHECK(storage_type() == kDefaultStorage) << "Slice not yet implemented for storage " - << storage_type(); - size_t length = shape_.ProdShape(1, shape_.ndim()); - ret.offset_ += begin * length; - ret.shape_[0] = end - begin; + auto stype = storage_type(); + if (stype == kDefaultStorage) { + size_t length = shape_.ProdShape(1, shape_.ndim()); + ret.offset_ += begin * length; + ret.shape_[0] = end - begin; + } else if (stype == kCSRStorage) { + // for csr, the offset variable is used to adjust indptr + // while getting aux_data, the dptr of indptr is advanced by offset, + // and shape for indptr is end - begin + 1 + ret.offset_ += begin; + ret.shape_[0] = end - begin; + } else { + LOG(FATAL) << "Slice not yet implemented for storage " << stype; + } return ret; } /*! @@ -460,6 +510,19 @@ class NDArray { } return ret; } + // Wrap the tblob of aux data into an NDArray which shares the same variable with the + // current one. + inline const NDArray AuxNDArray(size_t i) const { + CHECK_NE(storage_type(), kDefaultStorage); + CHECK(i < ptr_->aux_shapes.size()); + return NDArray(aux_data(i), ctx().dev_id, var()); + } + // Wrap the tblob of data into an NDArray which shares the same variable with the + // current one. + inline const NDArray DataNDArray() const { + CHECK_NE(storage_type(), kDefaultStorage); + return NDArray(data(), ctx().dev_id, var()); + } /*! * \brief Create a NDArray that shares memory with current one * The new array must have smaller memory size than the current array. @@ -562,7 +625,8 @@ class NDArray { */ /*! \brief construct from static data */ bool static_data; - /*! \brief whether allocation is delayed. */ + /*! \brief whether data allocation is delayed. This doesn't indicate whether aux data + allocation is delayed. */ bool delay_alloc; // the type of the storage. The storage_type is never kUndefinedStorage once the chunk // is constructed. @@ -573,8 +637,10 @@ class NDArray { Context ctx; // The shape of the chunk data. // This might not be the same shape as the NDArray, since the storage may be sparse. + // The default value for storage_shape is {0} when an empty non-default NDArray is created. TShape storage_shape; - // The shape of aux data. The default value for the shape is 0. + // The shape of aux data. The default value for the shape depends on the type of storage. + // If aux_shapes[i].Size() is zero, aux data i is empty. std::vector aux_shapes; // \brief skip the deletion of var handle. Usually set when shared_var is present. 
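As the csr branch of Slice above notes, slicing only bumps offset_ (used to advance the indptr pointer in aux_data) and shrinks the first dimension; the values and indices blobs stay shared with the source array. The Python _slice docstring further down documents the same behaviour; a short sketch with the same illustrative data:

    import mxnet as mx
    import numpy as np

    indptr  = np.array([0, 2, 3, 6])
    indices = np.array([0, 2, 2, 0, 1, 2])
    data    = np.array([1, 2, 3, 4, 5, 6])
    a = mx.sparse_nd.csr(data, indptr, indices, (3, 3))

    s = a[1:2]                    # read-only view; only indptr is adjusted
    print(s.asnumpy())            # [[0, 0, 3]]
    print(s.indptr.asnumpy())     # [2, 3]
    print(s.indices.asnumpy())    # unchanged superset: [0, 2, 2, 0, 1, 2]
    print(s.values.asnumpy())     # unchanged superset: [1, 2, 3, 4, 5, 6]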
bool skip_delete_var = false; @@ -589,44 +655,6 @@ class NDArray { shandle.ctx = ctx_; if (!delay_alloc_) this->CheckAndAlloc(); } - // construct a chunk by copying over data - Chunk(const NDArray &nd, const std::vector &nd_aux, Context ctx_, - NDArrayStorageType storage_type_) - : static_data(false), delay_alloc(false), storage_type(storage_type_), ctx(ctx_) { - // Vars - var = Engine::Get()->NewVariable(); - // Data Storage - const auto &data = nd.data(); - storage_shape = data.shape_; - shandle.ctx = ctx; - shandle.size = data.shape_.Size() * mshadow::mshadow_sizeof(data.type_flag_); - shandle = Storage::Get()->Alloc(shandle.size, shandle.ctx); - - // Copy data - // Single threaded copy may not saturate memory bandwidth - CHECK_EQ(nd.storage_type(), kDefaultStorage); - auto data_blob = TBlob(shandle.dptr, storage_shape, shandle.ctx.dev_mask(), data.type_flag_); - NDArray data_wrapper(data_blob, ctx.dev_id, var); - CopyFromTo(nd, &data_wrapper, 0, false); - - // Aux shapes, types and storage - CHECK_GT(storage_shape.ndim(), 0); - for (size_t i = 0; i < nd_aux.size(); i++) { - const auto &aux_d = nd_aux[i].data(); - aux_shapes.emplace_back(aux_d.shape_); - aux_types.emplace_back(aux_d.type_flag_); - Storage::Handle aux_handle; - aux_handle.ctx = ctx; - aux_handle.size = aux_shapes[i].Size() * mshadow::mshadow_sizeof(aux_types[i]); - aux_handle = Storage::Get()->Alloc(aux_handle.size, aux_handle.ctx); - aux_handles.emplace_back(aux_handle); - // Copy aux data - CHECK_EQ(nd_aux[i].storage_type(), kDefaultStorage); - TBlob aux_blob(aux_handle.dptr, aux_shapes[i], ctx.dev_mask(), aux_types[i]); - NDArray aux_wrapper(aux_blob, ctx.dev_id, var); - CopyFromTo(nd_aux[i], &aux_wrapper, 0, false); - } - } Chunk(const TBlob &data, int dev_id, Engine::VarHandle shared_var) : static_data(true), delay_alloc(false) { @@ -651,15 +679,20 @@ class NDArray { shandle.size = data.shape_.Size() * mshadow::mshadow_sizeof(data.type_flag_); storage_shape = data.shape_; } - Chunk(Context ctx_, bool delay_alloc_, std::vector aux_types_, - NDArrayStorageType storage_type_) + // Constructor for a non-default storage chunk + Chunk(NDArrayStorageType storage_type_, const TShape &storage_shape_, Context ctx_, + bool delay_alloc_, int dtype, const std::vector &aux_types_, + const std::vector &aux_shapes_) : static_data(false), delay_alloc(delay_alloc_), storage_type(storage_type_), - aux_types(aux_types_), ctx(ctx_) { + aux_types(aux_types_), ctx(ctx_), storage_shape(storage_shape_), + aux_shapes(aux_shapes_) { var = Engine::Get()->NewVariable(); - // Assume alloc is always delayed for non-default storage type - CHECK(delay_alloc_); - if (!delay_alloc_) { - this->CheckAndAlloc(); + // aux_handles always reflect the correct number of aux data + for (size_t i = 0; i < aux_shapes.size(); i++) { + CheckAndAllocAuxData(i, aux_shapes[i]); + } + if (!delay_alloc) { + CheckAndAllocData(storage_shape, dtype); } } /*! 
\brief check if delay alloc is on, do alloc if not yet done */ @@ -672,55 +705,63 @@ class NDArray { inline void CheckAndAlloc(const TShape &shape, const std::vector &aux_shapes, int dtype) { // calculate size, perform allocation - if (delay_alloc) { - if (kRowSparseStorage == storage_type) { - // For row sparse, aux_shape indicates the number of rows to allocate - auto aux_shape = aux_shapes[rowsparse::kIdx]; - CHECK_EQ(shape.ndim(), 2) << "High dim RowSparse not yet implemented"; - CheckAndAllocAuxData(rowsparse::kIdx, aux_shape); - TShape storage_shape(shape); - storage_shape[0] = aux_shape[0]; - CheckAndAllocData(storage_shape, dtype); - } else if (kCSRStorage == storage_type) { - CheckAndAllocAuxData(csr::kIndPtr, aux_shapes[csr::kIndPtr]); - CheckAndAllocAuxData(csr::kIdx, aux_shapes[csr::kIdx]); - CheckAndAllocData(aux_shapes[csr::kIdx], dtype); - } else { - LOG(FATAL) << "Storage type " << storage_type << " not implemented for CheckAndAlloc"; - } + if (kRowSparseStorage == storage_type) { + // For row sparse, aux_shape indicates the number of rows to allocate + auto aux_shape = aux_shapes[rowsparse::kIdx]; + CHECK_EQ(shape.ndim(), 2) << "High dim RowSparse not yet implemented"; + CheckAndAllocAuxData(rowsparse::kIdx, aux_shape); + TShape storage_shape(shape); + storage_shape[0] = aux_shape[0]; + CheckAndAllocData(storage_shape, dtype); + } else if (kCSRStorage == storage_type) { + CheckAndAllocAuxData(csr::kIndPtr, aux_shapes[csr::kIndPtr]); + CheckAndAllocAuxData(csr::kIdx, aux_shapes[csr::kIdx]); + CheckAndAllocData(aux_shapes[csr::kIdx], dtype); + } else { + LOG(FATAL) << "Storage type " << storage_type << " not implemented for CheckAndAlloc"; } } // create storage handle for data based on shape and dtype, assuming ctx is set - // shandle and storage shape are updated + // storage shape is also updated + // if data is already allocated, try reuse the storage. Otherwise, free the current one + // and allocate new storage inline void CheckAndAllocData(const TShape &shape, int dtype) { CHECK_NE(aux_shapes.size(), 0) << "data is expected to be allocated after aux_data"; + auto dbytes = shape.Size() * mshadow::mshadow_sizeof(dtype); + if (shandle.size < dbytes) { + // free storage if necessary and alloc again + if (shandle.size > 0) Storage::Get()->Free(shandle); + // init storage + shandle = Storage::Get()->Alloc(dbytes, ctx); + } // init shape storage_shape = shape; - // init storage - auto dbytes = shape.Size() * mshadow::mshadow_sizeof(dtype); - shandle = Storage::Get()->Alloc(dbytes, ctx); // delay_alloc is only set when data storage handle is present delay_alloc = false; } - // create storage handle for aux data based on shape, assuming ctx and aux type are set - // aux_handle and aux shape are updated + // create storage handle for aux data based on shape + // this function assumes ctx, aux shapes and aux types are set + // aux shape is also updated + // if aux data is already allocated, try reuse the storage. 
Otherwise, free the current one + // and allocate new storage inline void CheckAndAllocAuxData(size_t i, const TShape &shape) { - CHECK_GT(shape.Size(), 0) << "shape cannot be empty in CheckAndAllocAuxData"; CHECK_EQ(shape.ndim(), 1) << "shape must be 1D in CheckAndAllocAuxData"; - CHECK_EQ(aux_shapes.size(), aux_handles.size()); CHECK_NE(storage_type, kUndefinedStorage) << "storage type cannot be kUndefinedStorage in CheckAndAllocAuxData"; CHECK_NE(storage_type, kDefaultStorage) << "storage type cannot be kDefaultStorage in CheckAndAllocAuxData"; - if (aux_shapes.size() <= i) { - aux_shapes.resize(i + 1); + if (aux_handles.size() <= i) { aux_handles.resize(i + 1); } + size_t aux_bytes = shape.Size() * mshadow::mshadow_sizeof(aux_types[i]); + if (aux_handles[i].size < aux_bytes) { + // free storage if necessary and alloc again + if (aux_handles[i].size > 0) Storage::Get()->Free(aux_handles[i]); + // init aux storage + aux_handles[i] = Storage::Get()->Alloc(aux_bytes, ctx); + } // init shape aux_shapes[i] = shape; - // Init aux storage - size_t aux_bytes = shape.Size() * mshadow::mshadow_sizeof(aux_types[i]); - aux_handles[i] = Storage::Get()->Alloc(aux_bytes, ctx); } /*! \brief destructor */ ~Chunk() { @@ -747,13 +788,95 @@ class NDArray { /*! \brief shape of current NDArray */ TShape shape_; /*! \brief offset in chunk */ - size_t offset_; + size_t offset_ = 0; /*! \brief type of data */ int dtype_ = -1; /*! \brief node entry for autograd */ autograd::AGNodeEntry entry_; }; +/*! + * \brief issue an copy operation from one NDArray to another + * the two ndarray can sit on different devices + * this operation will be scheduled by the engine + * + * \param from the ndarray we want to copy data from + * \param to the target ndarray + * \param priority Priority of the action. + * \param alloc_output whether to allocate memory for the output ndarray + * \note The function name explicitly marks the order of from and to + * due to different possible convention carried by copy function. 
+ */ +void CopyFromTo(const NDArray &from, NDArray *to, int priority = 0, bool alloc_output = true); + +// Make a copy of a row-sparse NDArray +template +inline void CopyFromToRspImpl(const NDArray from, NDArray *to, RunContext ctx, bool alloc) { + using namespace mshadow; + CHECK_EQ(from.storage_type(), to->storage_type()) << "Copying with different storage type"; + // if source is zeros, fill destination with zeros, too + auto s = ctx.get_stream(); + if (!from.storage_initialized()) { + op::FillZerosRspImpl(s, to); + return; + } + auto aux_shape = from.aux_shape(rowsparse::kIdx); + if (alloc) to->CheckAndAlloc({aux_shape}); + TBlob val = to->data(); + TBlob idx = to->aux_data(rowsparse::kIdx); + ndarray::Copy(from.data(), &val, + from.ctx(), to->ctx(), ctx); + ndarray::Copy(from.aux_data(rowsparse::kIdx), &idx, + from.ctx(), to->ctx(), ctx); +} + +// Make a copy of a dense NDArray +template +inline void CopyFromToDnsImpl(const NDArray from, NDArray *to, RunContext ctx, bool alloc) { + using namespace mshadow; + CHECK_EQ(from.storage_type(), to->storage_type()) << "Copying with different storage type"; + if (alloc) to->CheckAndAlloc(); + TBlob tmp = to->data(); + ndarray::Copy(from.data(), &tmp, + from.ctx(), to->ctx(), ctx); + auto &gpu_tid = typeid(mshadow::gpu); + if (typeid(from_xpu) == gpu_tid || typeid(to_xpu) == gpu_tid) { + // Wait GPU kernel to complete + ctx.get_stream()->Wait(); + } +} + +// Make a copy of an NDArray based on storage type +template +void CopyFromToImpl(const NDArray from, NDArray *to, RunContext ctx, bool alloc) { + // if storage type doesn't match, cast the storage first + auto from_stype = from.storage_type(); + auto to_stype = to->storage_type(); + NDArray casted_nd; + if (from_stype != to_stype) { + TShape shape = from.shape(); + auto from_ctx = from.ctx(); + auto s = ctx.get_stream(); + // TODO(haibin) inplace conversion + if (to_stype == kDefaultStorage) { + casted_nd = NDArray(shape, from_ctx); + } else { + casted_nd = NDArray(to_stype, shape, from_ctx); + } + op::CastStorageComputeImpl(s, from, casted_nd); + } else { + casted_nd = from; + } + if (to_stype == kDefaultStorage) { + CopyFromToDnsImpl(casted_nd, to, ctx, alloc); + } else if (to_stype == kRowSparseStorage) { + CopyFromToRspImpl(casted_nd, to, ctx, alloc); + } else { + // TODO(haibin) support csr copy. For sliced csr, we want to only copy the related + // indices and values instead of the superset. + LOG(FATAL) << "Not implemented yet"; + } +} /*! * \brief Perform elementwise sum over each data from source, store result into out. diff --git a/include/mxnet/storage.h b/include/mxnet/storage.h index 1b765233947d..e236a9cf313b 100644 --- a/include/mxnet/storage.h +++ b/include/mxnet/storage.h @@ -23,11 +23,11 @@ class Storage { /*! * \brief Pointer to the data. */ - void* dptr; + void* dptr{nullptr}; /*! * \brief Size of the storage. */ - size_t size; + size_t size{0}; /*! * \brief Context information about device and ID. 
*/ diff --git a/python/mxnet/optimizer.py b/python/mxnet/optimizer.py index 124f1f6218ee..fa2c6343f7e5 100644 --- a/python/mxnet/optimizer.py +++ b/python/mxnet/optimizer.py @@ -339,10 +339,10 @@ def update(self, index, weight, grad, state): if state is not None: sparse_sgd_mom_update(weight, grad, state, out=weight, - lr=lr, wd=wd, **self.kwargs) + lr=lr, wd=wd, **self.kwargs) else: sparse_sgd_update(weight, grad, out=weight, - lr=lr, wd=wd, **self.kwargs) + lr=lr, wd=wd, **self.kwargs) @register diff --git a/python/mxnet/sparse_ndarray.py b/python/mxnet/sparse_ndarray.py index be726eac53e6..f8788744a312 100644 --- a/python/mxnet/sparse_ndarray.py +++ b/python/mxnet/sparse_ndarray.py @@ -1,23 +1,21 @@ # coding: utf-8 -# pylint: disable= too-many-lines, redefined-builtin, protected-access -# pylint: disable=import-error, no-name-in-module, undefined-variable -"""NDArray API of mxnet.""" +"""SparseNDArray API of mxnet.""" from __future__ import absolute_import from __future__ import division -# try: -# from __builtin__ import slice as py_slice -# except ImportError: -# from builtins import slice as py_slice +try: + from __builtin__ import slice as py_slice +except ImportError: + from builtins import slice as py_slice import ctypes -# import warnings +import warnings import os as _os import sys as _sys # import operator import numpy as np -from .base import _LIB # , string_types, numeric_types +from .base import _LIB, numeric_types #string_types from .base import c_array, mx_real_t # , py_str, c_str from .base import mx_uint, NDArrayHandle, check_call # from .base import ctypes2buffer @@ -25,7 +23,7 @@ from . import _ndarray_internal as _internal from . import ndarray from .ndarray import _DTYPE_NP_TO_MX, _DTYPE_MX_TO_NP -from .ndarray import _STORAGE_TYPE_ID_TO_STR, _STORAGE_TYPE_STR_TO_ID +from .ndarray import _STORAGE_TYPE_STR_TO_ID#, _STORAGE_TYPE_ID_TO_STR from .ndarray import NDArray # Use different verison of SymbolBase @@ -48,9 +46,8 @@ } -def _new_alloc_handle(storage_type, shape, ctx, delay_alloc=True, - dtype=mx_real_t, aux_types=None): - """Return a new handle with specified shape and context. +def _new_alloc_handle(storage_type, shape, ctx, delay_alloc, dtype, aux_types, aux_shapes=None): + """Return a new handle with specified shape, type and context. Empty handle is only used to hold results @@ -60,7 +57,10 @@ def _new_alloc_handle(storage_type, shape, ctx, delay_alloc=True, A new empty ndarray handle """ hdl = NDArrayHandle() - aux_type_list = [int(_DTYPE_NP_TO_MX[np.dtype(aux_t).type]) for aux_t in aux_types] + aux_type_ids = [int(_DTYPE_NP_TO_MX[np.dtype(aux_t).type]) for aux_t in aux_types] + aux_shapes = [(0,) for aux_t in aux_types] if aux_shapes is None else aux_shapes + aux_shape_lens = [len(aux_shape) for aux_shape in aux_shapes] + aux_shapes = sum(aux_shapes, ()) num_aux = mx_uint(len(aux_types)) check_call(_LIB.MXNDArrayCreateSparseEx( ctypes.c_int(int(_STORAGE_TYPE_STR_TO_ID[storage_type])), @@ -71,18 +71,22 @@ def _new_alloc_handle(storage_type, shape, ctx, delay_alloc=True, ctypes.c_int(int(delay_alloc)), ctypes.c_int(int(_DTYPE_NP_TO_MX[np.dtype(dtype).type])), num_aux, - c_array(ctypes.c_int, aux_type_list), + c_array(ctypes.c_int, aux_type_ids), + c_array(mx_uint, aux_shape_lens), + c_array(mx_uint, aux_shapes), ctypes.byref(hdl))) return hdl class SparseNDArray(NDArray): - ''' sparse ndarray ''' + """An array object representing a multidimensional, homogeneous array of +fixed-size items, stored in sparse format. 
+ + """ __slots__ = [] - # def __repr__(self): - def __reduce__(self): - return SparseNDArray, (None,), self.__getstate__() + #def __reduce__(self): + # return SparseNDArray, (None,), self.__getstate__() def __add__(self, other): raise Exception('Not implemented for SparseND yet!') @@ -163,16 +167,109 @@ def __setstate__(self, state): raise Exception('Not implemented for SparseND yet!') def __setitem__(self, key, value): - raise Exception('Not implemented for SparseND yet!') + """x.__setitem__(i, y) <=> x[i]=y + + Set self[key] to value. + + Parameters + ---------- + key : slice + The indexing key. + value : NDArray or numpy.ndarray + The value to set. + + """ + if not self.writable: + raise ValueError('Failed to assign to a readonly NDArray') + if isinstance(key, py_slice): + if key.step is not None or key.start is not None or key.stop is not None: + raise ValueError('Assignment with slicing not supported in SparseNDArray.') + if isinstance(value, NDArray): + # avoid copying to itself + if value.handle is not self.handle: + value.copyto(self) + elif isinstance(value, numeric_types): + raise Exception("Assigning numeric types to SparseNDArray not supported yet.") + elif isinstance(value, (np.ndarray, np.generic)): + # TODO(haibin) this is not efficient. Implement sync_copyfrom for + # sparse ndarray to avoid an extra copy + warnings.warn('Assigning non-NDArray object to SparseNDArray is not efficient', + RuntimeWarning) + tmp = ndarray.array(value) + tmp.copyto(self) + else: + raise TypeError('type %s not supported' % str(type(value))) + else: + assert(isinstance(key, (int, tuple))) + raise Exception('SparseNDArray only supports [:] for assignment') def __getitem__(self, key): - raise Exception('Not implemented for SparseND yet!') + stype = self.storage_type + assert(stype == 'csr'), "getitem for " + str(stype) + " not implemented yet" + if isinstance(key, int): + raise Exception("Not implemented yet") + if isinstance(key, py_slice): + if key.step is not None: + raise ValueError('NDArray only supports continuous slicing on axis 0') + if key.start is not None or key.stop is not None: + return self._slice(key.start, key.stop) + else: + return self + if isinstance(key, tuple): + raise ValueError('Multi-dimension indexing is not supported') def _sync_copyfrom(self, source_array): raise Exception('Not implemented for SparseND yet!') def _slice(self, start, stop): - raise Exception('Not implemented for SparseND yet!') + """Returns a read-only sliced SparseNDArray that shares memory with current one. + For csr SparseNDArray, it only slices the indptr array, and keeps the original values + and indices. + + The existing slice operation is not very efficient when it's copied, since the indices + and values are a superset of the sliced region. + + + Parameters + ---------- + start : int + Starting index of slice. + stop : int + Finishing index of slice. 
+ + Example + ---------- + >>> indptr = np.array([0, 2, 3, 6]) + >>> indices = np.array([0, 2, 2, 0, 1, 2]) + >>> data = np.array([1, 2, 3, 4, 5, 6]) + >>> a = mx.sparse_nd.csr(data, indptr, indices, (3, 3)) + >>> a.asnumpy() + array([[1, 0, 2], + [0, 0, 3], + [4, 5, 6]]) + + >>> a[1:2].asnumpy() + array([[0, 0, 3]]) + + >>> a[1:2].indptr.asnumpy() + array([[2, 3]]) + + >>> a[1:2].indicies.asnumpy() + array([0, 2, 2, 0, 1, 2]) + + >>> a[1:2].values.asnumpy() + array([1, 2, 3, 4, 5, 6]) + + """ + stype = self.storage_type + assert(stype == 'csr'), "_slice for " + str(stype) + " not implemented yet" + warnings.warn('slicing SparseNDArray is not efficient', RuntimeWarning) + handle = NDArrayHandle() + start = mx_uint(start) if start else mx_uint(0) + stop = mx_uint(stop) if stop else mx_uint(self.shape[0]) + check_call(_LIB.MXNDArraySlice( + self.handle, start, stop, ctypes.byref(handle))) + return SparseNDArray(handle=handle, writable=False) def _at(self, idx): raise Exception('at operator for SparseND is not supported.') @@ -183,200 +280,311 @@ def reshape(self, shape): def broadcast_to(self, shape): raise Exception('Not implemented for SparseND yet!') - # def wait_to_read(self): - # @property - # def shape(self): - def aux_type(self, i): + def _aux_type(self, i): + """Data-type of the array’s ith aux data. + + Returns + ------- + numpy.dtype + This NDArray's data type. + """ aux_type = ctypes.c_int() - check_call(_LIB.MXNDArrayGetAuxType( - self.handle, i, ctypes.byref(aux_type))) + check_call(_LIB.MXNDArrayGetAuxType(self.handle, i, ctypes.byref(aux_type))) return _DTYPE_MX_TO_NP[aux_type.value] @property - def size(self): - raise Exception('Not implemented for SparseND yet!') + def values(self): + return self._data(0) + + @property + def indices(self): + stype = self.storage_type + if stype == 'row_sparse': + return self._aux_data(0) + elif stype == 'csr': + return self._aux_data(1) + raise Exception("unknown storage type " + stype) + + @property + def indptr(self): + stype = self.storage_type + if stype == 'csr': + return self._aux_data(0) + raise Exception("unknown storage type " + stype) - # @property - # def context(self): - # @property - # def dtype(self): @property - def num_aux(self): - num_aux = mx_uint() - check_call(_LIB.MXNDArrayGetNumAux(self.handle, ctypes.byref(num_aux))) - return num_aux.value + def _num_aux(self): + ''' The number of aux data used to help store the sparse ndarray. + ''' + return len(_STORAGE_AUX_TYPES[self.storage_type]) + @property # pylint: disable= invalid-name, undefined-variable def T(self): - raise Exception('Not implemented for SparseND yet!') - # TODO(haibin) Should this be a property? + raise Exception('Transpose is not supported for SparseNDArray.') + + @property def aux_types(self): + ''' The data types of the aux data for the SparseNDArray. + ''' aux_types = [] - num_aux = self.num_aux + num_aux = self._num_aux for i in xrange(num_aux): - aux_types.append(self.aux_type(i)) + aux_types.append(self._aux_type(i)) return aux_types def asnumpy(self): """Return a dense ``numpy.ndarray`` object with value copied from this array - """ - dense_nd = self.to_dense() - return dense_nd.asnumpy() - def asscalar(self): - raise Exception('Not implemented for SparseND yet!') + """ + return self.to_dense().asnumpy() def astype(self, dtype): raise Exception('Not implemented for SparseND yet!') def copyto(self, other): + """Copies the value of this array to another array. 
+ + If ``other`` is a ``NDArray`` object, then ``other.shape`` and + ``self.shape`` should be the same. This function copies the value from + ``self`` to ``other``. + + If ``other`` is a context, a new ``NDArray`` will be first created on + the target context, and the value of ``self`` is copied. + + Parameters + ---------- + other : NDArray or Context + The destination array or context. + + Returns + ------- + NDArray + The copied array. If ``other`` is an ``NDArray``, then the return value + and ``other`` will point to the same ``NDArray``. + """ if isinstance(other, NDArray): if other.handle is self.handle: warnings.warn('You are attempting to copy an array to itself', RuntimeWarning) return return _internal._copyto(self, out=other) elif isinstance(other, Context): - hret = SparseNDArray(_new_alloc_handle(self.storage_type, self.shape, other, True, self.dtype, self.aux_types())) + hret = SparseNDArray(_new_alloc_handle(self.storage_type, self.shape, other, + True, self.dtype, self.aux_types)) return _internal._copyto(self, out=hret) else: raise TypeError('copyto does not support type ' + str(type(other))) - def copy(self): - raise Exception('Not implemented for SparseND yet!') - - def as_in_context(self, context): - raise Exception('Not implemented for SparseND yet!') - def to_dense(self): return to_dense(self) + def _aux_data(self, i, writable=False): + """ Get a reference to the i-th aux data associated with the SparseNDArray. If the + SparseNDArray is not yet compacted, the returned result may include invalid values. -# TODO We need a to_dense method to test it -def csr(values, idx, indptr, shape, ctx=Context.default_ctx, dtype=mx_real_t, aux_types=None): - ''' constructor ''' - hdl = NDArrayHandle() - # TODO currently only supports NDArray input - assert (isinstance(values, NDArray)) - assert (isinstance(indptr, NDArray)) - assert (isinstance(idx, NDArray)) - assert (isinstance(shape, tuple)) - indices = c_array(NDArrayHandle, [indptr.handle, idx.handle]) - num_aux = mx_uint(2) - check_call(_LIB.MXNDArrayCreateSparse( - values.handle, num_aux, indices, - c_array(mx_uint, shape), - mx_uint(len(shape)), - ctypes.c_int(_STORAGE_TYPE_STR_TO_ID['csr']), - ctypes.c_int(ctx.device_typeid), - ctypes.c_int(ctx.device_id), - ctypes.c_int(int(False)), - ctypes.c_int(int(_DTYPE_NP_TO_MX[np.dtype(dtype).type])), - ctypes.byref(hdl))) - return SparseNDArray(hdl) + """ + hdl = NDArrayHandle() + check_call(_LIB.MXNDArrayGetAuxNDArray(self.handle, i, ctypes.byref(hdl))) + return NDArray(hdl, writable) + def _data(self, writable=False): + """ Get a reference to the data associated with the SparseNDArray. If the + SparseNDArray is not yet compacted, the returned result may include invalid values. 
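copyto and to_dense are the front-end entry points to the storage-aware copy path (CopyFromToImpl / cast_storage) introduced in ndarray.h above. A hedged sketch of both directions, using only calls that appear elsewhere in this patch:

    import mxnet as mx

    a = mx.sparse_nd.zeros('row_sparse', (2, 2))

    # storage conversion: sparse -> default storage
    dense = a.to_dense()          # same as mx.nd.cast_storage(a, storage_type='default')

    # copy to a context: a new SparseNDArray is allocated on the target device
    b = a.copyto(mx.cpu(0))

    # copy into an existing array of the same shape and storage type
    c = mx.sparse_nd.zeros('row_sparse', (2, 2))
    a.copyto(c)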
-# pylint: enable= no-member -# TODO(haibin) also specify aux_types -def row_sparse(values, index, shape, ctx=Context.default_ctx, dtype=mx_real_t, aux_types=None): - ''' rsp constructor which only accepts NDArray as input ''' - hdl = NDArrayHandle() - assert (isinstance(values, NDArray)) - assert (isinstance(index, NDArray)) - indices = c_array(NDArrayHandle, [index.handle]) - num_aux = mx_uint(1) - check_call(_LIB.MXNDArrayCreateSparse( - values.handle, num_aux, indices, - c_array(mx_uint, shape), - mx_uint(len(shape)), - ctypes.c_int(_STORAGE_TYPE_STR_TO_ID['row_sparse']), - ctypes.c_int(ctx.device_typeid), - ctypes.c_int(ctx.device_id), - ctypes.c_int(int(False)), - ctypes.c_int(int(_DTYPE_NP_TO_MX[np.dtype(dtype).type])), - ctypes.byref(hdl))) - return SparseNDArray(hdl) - - -def array(values, index_list, storage_type, shape, ctx=None, dtype=mx_real_t, aux_types=None): - ''' constructor ''' - # TODO check input array types. Assume NDArray class for now - # TODO support other types - # TODO also specify auxtypes - assert (storage_type == 'row_sparse' or storage_type == 'csr') - if aux_types is not None: - assert isinstance(aux_types, list) - assert len(aux_types) == len(index_list) - if not isinstance(values, NDArray): - values = ndarray.array(values) - for i, index in enumerate(index_list): - if not isinstance(index, NDArray): - index_list[i] = ndarray.array(index, dtype=aux_types[i] if aux_types is not None else None) - - if isinstance(shape, int): - shape = (shape,) + """ + hdl = NDArrayHandle() + check_call(_LIB.MXNDArrayGetDataNDArray(self.handle, ctypes.byref(hdl))) + return NDArray(hdl, writable) + + + def compact(self): + raise Exception("Not implemented yet") + +def _prepare_src_array(src, dtype, default_dtype): + if isinstance(src, NDArray): + dtype = src.dtype if dtype is None else dtype + else: + dtype = default_dtype if dtype is None else dtype + if not isinstance(src, np.ndarray): + try: + src = np.array(src, dtype=dtype) + except: + raise TypeError('values must be array like object') + return src, dtype + +def csr(values, indptr, indices, shape, ctx=None, dtype=None, indptr_type=None, indices_type=None): + """Creates a 2D array with compressed sparse row format. + + A SparseNDArray with `csr` storage represents a NDArray as three separate arrays: `values`, + `indptr` and `indices`. It uses the standard CSR representation where the column indices for + row i are stored in indices[indptr[i]:indptr[i+1]] and their corresponding values are stored + in values[indptr[i]:indptr[i+1]]. + + Parameters + ---------- + values: array_like + An object exposing the array interface, with shape [nnz], where D0 is the number of + non-zero entries. + indptr: array_like + An object exposing the array interface, with shape [D0 + 1]. The first element in indptr + should always be zero. + indices: array_like + An object exposing the array interface, with shape [nnz]. + ctx : Context, optional + Device context (default is the current default context). + dtype : str or numpy.dtype, optional + The data type of the output array. The default dtype is ``values.dtype`` + if `values` is an `NDArray`, `float32` otherwise. + indptr_type: str or numpy.dtype, optional + The data type of the indices array. The default dtype is ``indptr.dtype`` + if `indptr` is an `NDArray`, `int32` otherwise. + indices_type: str or numpy.dtype, optional + The data type of the indices array. The default dtype is ``indices.dtype`` + if `indicies` is an `NDArray`, `int32` otherwise. 
+ + Returns + ------- + SparseNDArray + An `SparseNDArray` with the `csr` storage representation. + """ + storage_type = 'csr' + # context if ctx is None: ctx = Context.default_ctx - if storage_type == 'row_sparse': - arr = row_sparse(values, index_list[0], shape, ctx=ctx, dtype=dtype, aux_types=aux_types) - elif storage_type == 'csr': - arr = csr(values, index_list[0], index_list[1], shape, ctx, dtype, aux_types) - else: - raise Exception('Not implemented for SparseND yet!') - return arr + # prepare src array and types + values, dtype = _prepare_src_array(values, dtype, mx_real_t) + indptr, indptr_type = _prepare_src_array(indptr, indptr_type, + _STORAGE_AUX_TYPES[storage_type][0]) + indices, indices_type = _prepare_src_array(indices, indices_type, + _STORAGE_AUX_TYPES[storage_type][1]) + # verify types + assert('int' in str(indptr_type) or 'long' in str(indptr_type)) + assert('int' in str(indices_type) or 'long' in str(indices_type)) + # verify shapes + aux_shapes = [indptr.shape, indices.shape] + assert(values.ndim == 1) + assert(indptr.ndim == 1) + assert(indices.ndim == 1) + assert(len(shape) == 2) + result = SparseNDArray(_new_alloc_handle(storage_type, shape, ctx, False, dtype, + [indptr_type, indices_type], aux_shapes)) + # assign indptr, indices and values + values_ref = result._data(True) + indptr_ref = result._aux_data(0, True) + indices_ref = result._aux_data(1, True) + values_ref[:] = values + indptr_ref[:] = indptr + indices_ref[:] = indices + return result + +def row_sparse(values, indices, shape, ctx=None, dtype=None, indices_type=None): + """Creates a row sparse array with a set of tensor slices at given indices. + + A SparseNDArray with `row_sparse` storage is typically used to represent a subset of a larger + NDArray with `default` storage of shape [LARGE0, D1, .. , DN] where LARGE0 >> D0. The values + in indices are the indices in the first dimension of the slices that have been extracted from + the larger NDArray. + + The corresponding NDArray ``dense`` with `default` storage represented by a ``rsp`` + SparseNDArray with `row_sparse` storage has + + ``dense[rsp.indices[i], :, :, :, ...] = rsp.values[i, :, :, :, ...]`` + + `row_sparse` SparseNDArray is used principally in the definition of gradients for operations + that have sparse gradients (e.g. SparseEmbedding). + Parameters + ---------- + values: array_like + An object exposing the array interface, with shape [D0, D1, .. Dn], where D0 is + the number of rows with non-zeros entries. + indices: array_like + An object exposing the array interface, with shape [D0]. + ctx : Context, optional + Device context (default is the current default context). + dtype : str or numpy.dtype, optional + The data type of the output array. The default dtype is ``values.dtype`` + if `values` is an `NDArray`, `float32` otherwise. + indices_type: str or numpy.dtype, optional + The data type of the indices array. The default dtype is ``indices.dtype`` + if `indicies` is an `NDArray`, `int32` otherwise. + + Returns + ------- + SparseNDArray + An `SparseNDArray` with the `row_sparse` storage representation. 
+ """ + storage_type = 'row_sparse' + # context + if ctx is None: + ctx = Context.default_ctx + # prepare src array and types + values, dtype = _prepare_src_array(values, dtype, mx_real_t) + indices, indices_type = _prepare_src_array(indices, indices_type, + _STORAGE_AUX_TYPES[storage_type][0]) + # verify types + assert('int' in str(indices_type) or 'long' in str(indices_type)) + # verify shapes + assert(values.ndim == len(shape)) + assert(indices.ndim == 1) + result = SparseNDArray(_new_alloc_handle(storage_type, shape, ctx, False, dtype, + [indices_type], [indices.shape])) + # assign indices and values + values_ref = result._data(True) + indices_ref = result._aux_data(0, True) + values_ref[:] = values + indices_ref[:] = indices + return result def to_dense(source): - return ndarray.cast_storage(source, storage_type='default') + """ Return a dense array representation of this SparseNDArray. + Returns + ------- + SparseNDArray + The dense array with default storage + """ + return ndarray.cast_storage(source, storage_type='default') -def zeros(shape, storage_type, ctx=None, dtype=mx_real_t, aux_types=None): +def zeros(storage_type, shape, ctx=None, dtype=None, aux_types=None): """Return a new array of given shape and type, filled with zeros. Parameters ---------- shape : int or tuple of int The shape of the empty array - storage_type: - 'row_sparse', etc + storage_type: string + The storage type of the empty array, such as 'row_sparse', 'csr', etc ctx : Context, optional An optional device context (default is the current default context) dtype : str or numpy.dtype, optional An optional value type (default is `float32`) - aux_types: - [np.int32], etc + aux_types: list of numpy.dtype, optional + An optional type for the aux data for SparseNDArray (default values depends + on the storage type) Returns ------- - NDArray + SparseNDArray A created array - - Examples - -------- - >>> mx.nd.zeros(1).asnumpy() - array([ 0.], dtype=float32) - >>> mx.nd.zeros((1,2), mx.gpu(0)) - - >>> mx.nd.zeros((1,2), mx.gpu(0), 'float16').asnumpy() - array([[ 0., 0.]], dtype=float16) """ if ctx is None: ctx = Context.default_ctx - assert (storage_type == 'row_sparse' or storage_type == 'csr') + + dtype = mx_real_t if dtype is None else dtype if aux_types is None: - if 'row_sparse' == storage_type: - aux_types = _STORAGE_AUX_TYPES['row_sparse'] - elif 'csr' == storage_type: - aux_types = _STORAGE_AUX_TYPES['csr'] - # pylint: disable= no-member, protected-access - out = SparseNDArray(_new_alloc_handle(storage_type, shape, ctx, - aux_types=aux_types)) + if storage_type == 'row_sparse' or storage_type == 'csr': + aux_types = _STORAGE_AUX_TYPES[storage_type] + else: + raise Exception("unknown storage type") + assert(len(aux_types) == len(_STORAGE_AUX_TYPES[storage_type])) + out = SparseNDArray(_new_alloc_handle(storage_type, shape, ctx, True, dtype, aux_types)) return _internal._zeros(shape=shape, ctx=ctx, dtype=dtype, out=out) - # pylint: enable= no-member, protected-access - _STORAGE_TYPE_TO_ND_CLASS = { _STORAGE_TYPE_STR_TO_ID['default']: ndarray.NDArray, _STORAGE_TYPE_STR_TO_ID['row_sparse']: SparseNDArray, _STORAGE_TYPE_STR_TO_ID['csr']: SparseNDArray, } + _init_ndarray_module(_STORAGE_TYPE_TO_ND_CLASS, "mxnet") diff --git a/python/mxnet/symbol.py b/python/mxnet/symbol.py index 4beca64c487c..6d473be9cde2 100644 --- a/python/mxnet/symbol.py +++ b/python/mxnet/symbol.py @@ -1123,11 +1123,8 @@ def simple_bind(self, ctx, else attrs[k]['__storage_type__'] for k in self.list_arguments()} arg_shapes, _, aux_shapes = 
self.infer_shape(**kwargs) arg_types, _, aux_types = self.infer_type(**type_dict) - # print(storage_type_dict) arg_storage_types, _, _ = \ self.infer_storage_type(**storage_type_dict) - # print("arg_storage_types", arg_storage_types) - # print("out_storage_types", out_storage_types) if arg_shapes is None or arg_types is None: raise ValueError("Input node is not complete") @@ -1147,17 +1144,21 @@ def simple_bind(self, ctx, # alloc space arg_ndarrays = [ # avoid allocating dense ndarrays for sparse inputs - _nd_zeros(shape, dev, dtype=dtype) if storage_type != 'row_sparse' - else _sparse_nd_zeros(shape, storage_type, dev, dtype=dtype) + _nd_zeros(shape, dev, dtype=dtype) if storage_type == 'default' + else _sparse_nd_zeros(storage_type, shape, dev, dtype=dtype) for dtype, dev, shape, storage_type in \ zip(arg_types, arg_ctx, arg_shapes, arg_storage_types)] - # print(arg_ndarrays) if grad_req != 'null': grad_ndarrays = {} for name, shape, dev, dtype in zip( self.list_arguments(), arg_shapes, arg_ctx, arg_types): if not isinstance(grad_req, dict) or grad_req[name] != 'null': - grad_ndarrays[name] = _nd_zeros(shape, dev, dtype=dtype) + # TODO(haibin) temporarily set gradient stype for embedding op + if name != 'embed_weight': + grad_ndarrays[name] = _nd_zeros(shape, dev, dtype=dtype) + else: + grad_ndarrays[name] = _sparse_nd_zeros('row_sparse', shape, + dev, dtype=dtype) else: grad_ndarrays = None diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py index fa799313ed6f..60fa7c1792ca 100644 --- a/python/mxnet/test_utils.py +++ b/python/mxnet/test_utils.py @@ -9,8 +9,10 @@ import os import errno import logging +import scipy as sp import numpy as np import numpy.testing as npt +import numpy.random as rnd import mxnet as mx from .context import cpu, gpu, Context from .ndarray import array @@ -65,29 +67,37 @@ def random_arrays(*shapes): return arrays # TODO(haibin) also include types in arguments -def random_sparse_ndarray(shape, storage_type, sparsity = 0.5, allow_zeros = False): - """Generate a random sparse ndarray.""" +def rand_sparse_ndarray(shape, storage_type, density=None): + """Generate a random sparse ndarray. 
Returns the ndarray, value(np) and indices(np) """ + density = rnd.rand() if density is None else density if storage_type == 'row_sparse': # TODO(haibin) support high dim sparse ndarray assert(len(shape) < 3) prod = np.prod(shape) - # sample idx - idx_sample = np.random.rand(shape[0]) - idx = np.argwhere(idx_sample > sparsity).flatten() - if idx.shape[0] == 0: - if allow_zeros: - return mx.sparse_nd.zeros(shape, 'row_sparse') - idx = np.array([0]) - # generate random values - num_rows = idx.shape[0] num_cols = long(prod / shape[0]) - value = np.random.rand(num_rows, num_cols) - indices = [idx] - arr = mx.sparse_nd.array(value, indices, storage_type, shape, aux_types=[np.int32]) + # sample index + idx_sample = rnd.rand(shape[0]) + indices = np.argwhere(idx_sample < density).flatten() + if indices.shape[0] == 0: + return mx.sparse_nd.zeros('row_sparse', shape), (np.array([]), np.array([], dtype='int32')) + # generate random values + val = rnd.rand(indices.shape[0], num_cols) + arr = mx.sparse_nd.row_sparse(val, indices, shape, indices_type=np.int32) + return arr, (val, indices) + elif storage_type == 'csr': + assert(len(shape) == 2) + csr = sp.sparse.rand(shape[0], shape[1], density=density, format='csr') + result = mx.sparse_nd.csr(csr.data, csr.indptr, csr.indices, shape) + return result, (csr.indptr, csr.indices, csr.data) else: - raise Exception('Not implemented for SparseND yet!') - return arr + assert(False), "unknown storage type" +def rand_ndarray(shape, storage_type, density=None): + if storage_type == 'default': + arr = mx.nd.array(random_arrays(shape)) + else: + arr, _ = rand_sparse_ndarray(shape, storage_type, density=density) + return arr def np_reduce(dat, axis, keepdims, numpy_reduce_func): """Compatible reduce for old version of NumPy. @@ -281,7 +291,7 @@ def _parse_location(sym, location, ctx): Returns ------- - dict of str to np.ndarray + dict of str to NDArray """ assert isinstance(location, (dict, list, tuple)) if isinstance(location, dict): @@ -592,14 +602,28 @@ def check_symbolic_backward(sym, location, out_grads, expected, rtol=1e-5, atol= if isinstance(expected, (list, tuple)): expected = {k:v for k, v in zip(sym.list_arguments(), expected)} args_grad_npy = {k:_rng.normal(size=v.shape) for k, v in expected.items()} - args_grad_data = {k: mx.nd.array(v, ctx=ctx) for k, v in args_grad_npy.items()} + # args_grad_data should be casted to storage type if hinted + # TODO(haibin) this is a temporary solution for testing. 
remove later + attrs = sym.attr_dict() + args_grad_data = {} + for k, v in args_grad_npy.items(): + grad_stype = attrs[k].get('grad_stype_hint', None) + nd = mx.nd.array(v, ctx=ctx) + if grad_stype is not None: + out = mx.nd.cast_storage(nd, storage_type=grad_stype) + args_grad_data[k] = out + else: + args_grad_data[k] = nd + if isinstance(grad_req, str): grad_req = {k:grad_req for k in sym.list_arguments()} elif isinstance(grad_req, (list, tuple)): grad_req = {k:v for k, v in zip(sym.list_arguments(), grad_req)} - executor = sym.bind(ctx=ctx, args=location, args_grad=args_grad_data, aux_states=aux_states) + executor = sym.bind(ctx=ctx, args=location, args_grad=args_grad_data, + aux_states=aux_states, grad_req=grad_req) executor.forward(is_train=True) + if isinstance(out_grads, (tuple, list)): out_grads = [mx.nd.array(v, ctx=ctx) for v in out_grads] elif isinstance(out_grads, (dict)): diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index efef3ed2b972..80b6d3688c02 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -131,31 +131,6 @@ int MXNDArrayCreate(const mx_uint *shape, API_END(); } -// TODO(haibin) remove this API -int MXNDArrayCreateSparse(NDArrayHandle data, - mx_uint num_aux, - NDArrayHandle *aux_vec, - const mx_uint *shape, - mx_uint ndim, - int storage_type, - int dev_type, - int dev_id, - int delay_alloc, - int dtype, - NDArrayHandle *out) { - API_BEGIN(); - auto ctx = Context::Create(static_cast(dev_type), dev_id); - std::vector aux_ndarrays; - NDArray* data_ptr = reinterpret_cast(data); - for (size_t i = 0; i < num_aux; i++) { - NDArray* nd_aux_ptr = reinterpret_cast(aux_vec[i]); - aux_ndarrays.push_back(*nd_aux_ptr); - } - NDArrayStorageType stype = (NDArrayStorageType) storage_type; - *out = new NDArray(*data_ptr, aux_ndarrays, ctx, stype, TShape(shape, shape + ndim)); - API_END(); -} - int MXNDArrayCreateEx(const mx_uint *shape, mx_uint ndim, int dev_type, @@ -181,16 +156,26 @@ int MXNDArrayCreateSparseEx(int storage_type, int dtype, mx_uint num_aux, int *aux_type, + mx_uint *aux_ndims, + const mx_uint *aux_shape, NDArrayHandle *out) { API_BEGIN(); std::vector aux_types; - for (size_t i = 0; i < num_aux; i++) aux_types.push_back(aux_type[i]); + std::vector aux_shapes; + auto shape_start = aux_shape; + for (size_t i = 0; i < num_aux; i++) { + // types + aux_types.push_back(aux_type[i]); + // shapes + aux_shapes.emplace_back(shape_start, shape_start + aux_ndims[i]); + shape_start += aux_ndims[i]; + } *out = new NDArray( NDArrayStorageType(storage_type), TShape(shape, shape + ndim), Context::Create(static_cast(dev_type), dev_id), delay_alloc != 0, - dtype, aux_types); + dtype, aux_types, aux_shapes); API_END(); } @@ -409,17 +394,27 @@ int MXNDArrayGetDType(NDArrayHandle handle, int MXNDArrayGetAuxType(NDArrayHandle handle, mx_uint i, - int *out_aux_type) { + int *out_type) { + API_BEGIN(); + NDArray *arr = static_cast(handle); + *out_type = arr->aux_type(i); + API_END(); +} + +int MXNDArrayGetAuxNDArray(NDArrayHandle handle, + mx_uint i, + NDArrayHandle *out) { API_BEGIN(); NDArray *arr = static_cast(handle); - *out_aux_type = arr->aux_type(i); + *out = new NDArray(arr->AuxNDArray(i)); API_END(); } -int MXNDArrayGetNumAux(NDArrayHandle handle, mx_uint *out_num_aux) { +int MXNDArrayGetDataNDArray(NDArrayHandle handle, + NDArrayHandle *out) { API_BEGIN(); NDArray *arr = static_cast(handle); - *out_num_aux = arr->num_aux(); + *out = new NDArray(arr->DataNDArray()); API_END(); } diff --git a/src/c_api/c_api_ndarray.cc b/src/c_api/c_api_ndarray.cc index 
784d59ed8638..69266cefabee 100644 --- a/src/c_api/c_api_ndarray.cc +++ b/src/c_api/c_api_ndarray.cc @@ -249,8 +249,8 @@ void SetDependency(std::vector *p_read_vars, } CHECK_LE(ntmp, 1) << "Only support 1 temp space request"; } - common::PrepVars(ndinputs, &read_vars); - common::PrepVars(ndoutputs, &write_vars); + for (auto& i : ndinputs) read_vars.emplace_back(i.var()); + for (auto& i : ndoutputs) write_vars.emplace_back(i.var()); if (mutate.count(op)) { auxidx = mutate[op](attrs); std::sort(auxidx.begin(), auxidx.end()); @@ -278,21 +278,21 @@ void PushFCompute(const FCompute& fn, RunContext rctx, engine::CallbackOnComplete on_complete) { std::vector input_blobs, output_blobs; - std::vector tmp_nds; + std::vector tmps; OpContext opctx{is_train, rctx, engine::CallbackOnComplete(), requested}; if (ctx.dev_mask() == gpu::kDevMask) { #if MXNET_USE_CUDA - common::PrepDefaultBlobs(ndinputs, ndoutputs, &input_blobs, - &output_blobs, &tmp_nds, true, opctx); + common::GetInputBlobs(ndinputs, &input_blobs, &tmps, opctx); + common::GetOutputBlobs(ndoutputs, &output_blobs, true); #else LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; #endif } else { - common::PrepDefaultBlobs(ndinputs, ndoutputs, &input_blobs, - &output_blobs, &tmp_nds, true, opctx); + common::GetInputBlobs(ndinputs, &input_blobs, &tmps, opctx); + common::GetOutputBlobs(ndoutputs, &output_blobs, true); } std::vector req(output_blobs.size(), kWriteTo); fn(attrs, opctx, input_blobs, req, output_blobs); diff --git a/src/common/utils.h b/src/common/utils.h index d99c097a84c8..c46a27862054 100644 --- a/src/common/utils.h +++ b/src/common/utils.h @@ -37,33 +37,29 @@ namespace common { #if DMLC_USE_CXX11 template -inline void PrepDefaultBlobs(const std::vector& ndinputs, - const std::vector& ndoutputs, - std::vector *input_blobs, - std::vector *output_blobs, - std::vector *tmp_nds, - bool alloc_outputs, - const OpContext& ctx) { - for (auto& i : ndinputs) { - if (i.storage_type() != kDefaultStorage) { - NDArray tmp_nd(i.shape(), i.ctx(), false); - op::CastStorageComputeEx({}, ctx, {i}, {}, {tmp_nd}); - tmp_nds->push_back(tmp_nd); - input_blobs->push_back(tmp_nd.data()); +inline void GetInputBlobs(const std::vector& nds, + std::vector *blobs, + std::vector *temps, + const OpContext& ctx) { + for (auto& nd : nds) { + if (nd.storage_type() != kDefaultStorage) { + NDArray temp(nd.shape(), nd.ctx(), false); + op::CastStorageComputeEx({}, ctx, {nd}, {}, {temp}); + temps->push_back(temp); + blobs->push_back(temp.data()); } else { - input_blobs->push_back(i.data()); + blobs->push_back(nd.data()); } } - for (auto& i : ndoutputs) { - if (alloc_outputs) i.CheckAndAlloc(); - output_blobs->push_back(i.data()); - } } -inline void PrepVars(const std::vector &nds, - std::vector *vars) { - for (auto& i : nds) { - vars->push_back(i.var()); +template +inline void GetOutputBlobs(const std::vector& nds, + std::vector *blobs, + bool alloc) { + for (auto& nd : nds) { + if (alloc) nd.CheckAndAlloc(); + blobs->push_back(nd.data()); } } diff --git a/src/executor/attach_op_execs_pass.cc b/src/executor/attach_op_execs_pass.cc index 70db4d16fee0..f690c9b03e4c 100644 --- a/src/executor/attach_op_execs_pass.cc +++ b/src/executor/attach_op_execs_pass.cc @@ -8,14 +8,15 @@ #include #include #include "./exec_pass.h" +#include "../common/utils.h" #if MXNET_USE_MKL2017 == 1 #include #include "../operator/mkl/mkl_memory-inl.h" #include "../operator/mkl/mkl_util-inl.h" #endif -#include "../common/utils.h" -#define EXEC_DISPATCH_DEBUG 0 +#define EXEC_ATTACH_OP_DEBUG 0 + namespace mxnet 
{ namespace op { @@ -29,6 +30,26 @@ class ForwardOpExecutor : public OpExecutor { public: void Run(RunContext rctx, bool is_gpu) override { op_ctx.run_ctx = rctx; + + // TODO(haibin) ForwardOp is stateful. If any input ndarray has non-default storage, + // we need to cast it to default storage and setup the tblobs again. For example, + // if any of the input ndarray chagnes, the updated value won't be reflected in the temporary + // ndarray with default storage. This is not efficient and should be improved later. + in_data_.clear(); out_data_.clear(); aux_data_.clear(); tmps_.clear(); + if (is_gpu) { +#if MXNET_USE_CUDA + common::GetInputBlobs(in_array_, &in_data_, &tmps_, op_ctx); + common::GetInputBlobs(aux_array_, &aux_data_, &tmps_, op_ctx); + common::GetOutputBlobs(out_array, &out_data_, true); +#else + LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; +#endif + } else { + common::GetInputBlobs(in_array_, &in_data_, &tmps_, op_ctx); + common::GetInputBlobs(aux_array_, &aux_data_, &tmps_, op_ctx); + common::GetOutputBlobs(out_array, &out_data_, true); + } + op_->Forward(op_ctx, in_data_, req, out_data_, aux_data_); #if MKL_EXPERIMENTAL == 1 mkl_tblobs_prv_to_cpu(in_data_); @@ -38,18 +59,14 @@ class ForwardOpExecutor : public OpExecutor { } void Setup() override { - in_data_.clear(); aux_data_.clear(); + // We need to tell whether in NDArray is input or aux for (size_t i = 0; i < in_array.size(); ++i) { if (!std::binary_search(aux_index_.begin(), aux_index_.end(), i)) { - in_data_.push_back(in_array[i].data()); + in_array_.emplace_back(in_array[i]); } else { - aux_data_.push_back(in_array[i].data()); + aux_array_.emplace_back(in_array[i]); } } - out_data_.resize(out_array.size()); - std::transform(out_array.begin(), out_array.end(), out_data_.begin(), [](const NDArray& nd) { - return nd.data(); - }); } Operator::ExecType exec_type() const override { return op_->exec_type(); @@ -65,6 +82,7 @@ class ForwardOpExecutor : public OpExecutor { std::shared_ptr op_; std::vector aux_index_; std::vector in_data_, out_data_, aux_data_; + std::vector in_array_, aux_array_, tmps_; }; // backward executor @@ -140,19 +158,22 @@ class FComputeExecutor : public OpExecutor { public: void Run(RunContext rctx, bool is_gpu) override { op_ctx.run_ctx = rctx; - if (!initialized) { + // setup blobs + // TODO(haibin) we should avoid repeating this if it's known that all inputs are in + // default-storage. 
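// ---------------------------------------------------------------------------
// A minimal standalone sketch (not part of this patch) of the blob-gathering
// fallback that the new common::GetInputBlobs / common::GetOutputBlobs helpers
// implement: any input whose storage is not the default (dense) type is first
// cast into a dense temporary, and the dense kernel then runs on plain blobs.
// The Array/Blob types below are simplified stand-ins, not the MXNet
// NDArray/TBlob API; the real cast is CastStorageComputeEx, whose kernels
// appear later in this patch.
#include <cstddef>
#include <vector>

struct Blob { float* dptr; size_t size; };

struct Array {
  bool dense;                 // true for default (dense) storage
  std::vector<float> values;  // dense buffer, or compacted sparse values
  Blob data() { return Blob{values.data(), values.size()}; }
};

// Cast a (conceptually) sparse array into a dense temporary.
Array CastToDense(const Array& src, size_t dense_size) {
  (void)src;  // a real cast would scatter src's stored rows into tmp
  Array tmp{true, std::vector<float>(dense_size, 0.0f)};
  return tmp;
}

void GetInputBlobsSketch(std::vector<Array>& inputs,
                         std::vector<Blob>* blobs,
                         std::vector<Array>* temps,
                         size_t dense_size) {
  for (auto& nd : inputs) {
    if (!nd.dense) {
      temps->push_back(CastToDense(nd, dense_size));  // keep temporary alive
      blobs->push_back(temps->back().data());
    } else {
      blobs->push_back(nd.data());
    }
  }
}
// ---------------------------------------------------------------------------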
+ { + in_data_.clear(); out_data_.clear(); if (is_gpu) { #if MXNET_USE_CUDA - common::PrepDefaultBlobs(in_array, out_array, &in_data_, - &out_data_, &tmp_nds_, true, op_ctx); + common::GetInputBlobs(in_array, &in_data_, &tmp_nds_, op_ctx); + common::GetOutputBlobs(out_array, &out_data_, true); #else LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; #endif } else { - common::PrepDefaultBlobs(in_array, out_array, &in_data_, - &out_data_, &tmp_nds_, true, op_ctx); + common::GetInputBlobs(in_array, &in_data_, &tmp_nds_, op_ctx); + common::GetOutputBlobs(out_array, &out_data_, true); } - initialized = true; } fcompute_(attrs_, op_ctx, in_data_, req, out_data_); #if MKL_EXPERIMENTAL == 1 @@ -160,10 +181,7 @@ class FComputeExecutor : public OpExecutor { mkl_tblobs_prv_to_cpu(out_data_); #endif } - void Setup() override { - in_array_ = in_array; - out_array_ = out_array; - } + void Setup() override {} Operator::ExecType exec_type() const override { return Operator::kSync; } @@ -175,8 +193,7 @@ class FComputeExecutor : public OpExecutor { FCompute fcompute_; NodeAttrs attrs_; std::vector in_data_, out_data_; - std::vector in_array_, out_array_, tmp_nds_; - bool initialized = false; + std::vector tmp_nds_; }; // fcomputend executor @@ -236,7 +253,7 @@ Graph AttachOpExecs(Graph g) { FCompute fcompute = common::GetFCompute(inode.source->op(), vctx[i]); FComputeEx fcompute_ex = common::GetFComputeEx(inode.source->op(), vctx[i], dispatch_stypes[i]); -#if EXEC_DISPATCH_DEBUG +#if EXEC_ATTACH_OP_DEBUG LOG(INFO) << "dispatch type = " << dispatch_stypes[i]; #endif if (fcreate_layer_op.count(inode.source->op())) { @@ -254,22 +271,30 @@ Graph AttachOpExecs(Graph g) { inode.source->attrs, vctx[i], ishape, itype)); } ret[i] = std::make_shared(opr, mutate_index); +#if EXEC_ATTACH_OP_DEBUG + LOG(INFO) << "ForwardOp for op " << inode.source->op()->name; +#endif } else if (is_layer_backward.get(inode.source->op(), false)) { CHECK_GE(inode.control_deps.size(), 1); uint32_t fwd_id = inode.control_deps[0]; CHECK(vctx[fwd_id] == vctx[i]); CHECK(ret[fwd_id] != nullptr); + CHECK_EQ(dispatch_stypes[i], kDefaultStorage) + << "BackwardOp doesn't handle non-default storage yet"; ret[i] = std::make_shared( dynamic_cast(ret[fwd_id].get())->op_, mxnet::op::OpPropGetOpProperty(inode.source->attrs), mutate_index); +#if EXEC_ATTACH_OP_DEBUG + LOG(INFO) << "BackwardOp for op " << inode.source->op()->name; +#endif } else if (fcompute_ex != nullptr) { -#if EXEC_DISPATCH_DEBUG +#if EXEC_ATTACH_OP_DEBUG LOG(INFO) << "FComputeEx for op " << inode.source->op()->name; #endif ret[i] = std::make_shared(fcompute_ex, inode.source->attrs); } else if (fcompute != nullptr) { -#if EXEC_DISPATCH_DEBUG +#if EXEC_ATTACH_OP_DEBUG LOG(INFO) << "FCompute for op " << inode.source->op()->name; #endif ret[i] = std::make_shared(fcompute, inode.source->attrs); diff --git a/src/executor/exec_pass.h b/src/executor/exec_pass.h index b23f7fa47fc9..20535be320d9 100644 --- a/src/executor/exec_pass.h +++ b/src/executor/exec_pass.h @@ -32,7 +32,7 @@ const int kNonDefaultStorage = -2; */ class OpExecutor { public: - /*! \brief input arrays */ + /*! \brief input data arrays, which may be either input or aux */ std::vector in_array; /*! 
\brief output data arrays */ std::vector out_array; diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc index 5e63a04f7140..e8fd1ed390da 100644 --- a/src/executor/graph_executor.cc +++ b/src/executor/graph_executor.cc @@ -403,29 +403,34 @@ Graph GraphExecutor::InitGraph(nnvm::Symbol symbol, size_t arg_top = 0, aux_top = 0; for (size_t i = 0; i < num_forward_inputs_; ++i) { const uint32_t nid = idx.input_nodes().at(i); + size_t eid = idx.entry_id(nid, 0); if (mutable_nodes.count(nid)) { CHECK_LT(aux_top, aux_states.size()); - data_entry_[idx.entry_id(nid, 0)] = aux_states[aux_top]; + data_entry_[eid] = aux_states[aux_top]; arg_shapes.push_back(aux_states[aux_top].shape()); arg_types.push_back(aux_states[aux_top].dtype()); arg_storage_types.push_back(aux_states[aux_top].storage_type()); ++aux_top; } else { CHECK_LT(arg_top, in_args.size()); - data_entry_[idx.entry_id(nid, 0)] = in_args[arg_top]; + data_entry_[eid] = in_args[arg_top]; arg_shapes.push_back(in_args[arg_top].shape()); arg_types.push_back(in_args[arg_top].dtype()); arg_storage_types.push_back(in_args[arg_top].storage_type()); ++arg_top; } - // LOG(INFO) << "update data_entry_[ " << idx.entry_id(nid, 0) << "]" - // << " " << data_entry_[idx.entry_id(nid, 0)].storage_type(); +#if EXECUTOR_DEBUG + LOG(INFO) << "assign data entry " << eid << "\tas stype " + << data_entry_[eid].storage_type() << " (input)"; +#endif } for (size_t j = num_forward_outputs_; j < idx.outputs().size(); ++j) { - data_entry_[idx.entry_id(idx.outputs()[j])] - = grad_store_[j - num_forward_outputs_].second; - // LOG(INFO) << "update data_entry_[ " << idx.entry_id(idx.outputs()[j]) << "]" - // << " " << data_entry_[idx.entry_id(idx.outputs()[j])].storage_type() << "(output)"; + auto eid = idx.entry_id(idx.outputs()[j]); + data_entry_[eid] = grad_store_[j - num_forward_outputs_].second; +#if EXECUTOR_DEBUG + LOG(INFO) << "assign data entry " << eid << "\tas stype " + << data_entry_[eid].storage_type() << " (output)"; +#endif } arg_shapes.resize(idx.input_nodes().size(), TShape()); arg_types.resize(idx.input_nodes().size(), -1); @@ -520,17 +525,19 @@ void GraphExecutor::InitDataEntryMemory(std::vector* shared_pool) { uint32_t nid = idx.input_nodes().at(i); uint32_t oid = head_grad_map_.at(idx[nid].source); uint32_t eid = idx.entry_id(idx.outputs()[oid]); - NDArrayStorageType storage_type = (NDArrayStorageType) vstorage_type[eid]; + NDArrayStorageType stype = (NDArrayStorageType) vstorage_type[eid]; CHECK_NE(vshape[eid].ndim(), 0U); CHECK_NE(vdtype[eid], -1); - // init NDArray based on storage_type - if (storage_type != kDefaultStorage) { - data_entry_[idx.entry_id(nid, 0)] = - NDArray(storage_type, vshape[eid], data_context[eid], true, vdtype[eid]); + auto data_eid = idx.entry_id(nid, 0); + // initialize based on storage_type + if (stype != kDefaultStorage) { + data_entry_[data_eid] = NDArray(stype, vshape[eid], data_context[eid], true, vdtype[eid]); } else { - data_entry_[idx.entry_id(nid, 0)] = - NDArray(vshape[eid], data_context[eid], false, vdtype[eid]); + data_entry_[data_eid] = NDArray(vshape[eid], data_context[eid], false, vdtype[eid]); } +#if EXECUTOR_DEBUG + LOG(INFO) << "init data entry " << data_eid << "\tas stype " << stype << "(head_grad)"; +#endif } // get maximum bytes in each pool for (size_t i = 0; i < vshape.size(); ++i) { @@ -546,7 +553,6 @@ void GraphExecutor::InitDataEntryMemory(std::vector* shared_pool) { if (info.bytes == 0) { info = PoolEntry{data_context[i], bytes, data_storage_type[i]}; } else { - // std::cout << 
"WARNING Updated info.bytes" << std::endl; info.bytes = std::max(info.bytes, bytes); } } @@ -600,7 +606,6 @@ void GraphExecutor::InitDataEntryMemory(std::vector* shared_pool) { } } // assign the data entries - for (size_t i = 0; i < data_entry_.size(); ++i) { // avoid pre-allocated arrays if (!data_entry_[i].is_none()) continue; @@ -612,8 +617,11 @@ void GraphExecutor::InitDataEntryMemory(std::vector* shared_pool) { const NDArray& src = data_pool_.at(storage_id); data_entry_[i] = src.AsArray(vshape[i], vdtype[i]); } else { - data_entry_[i] = NDArray(storage_type, vshape[i], vctx[i]); + data_entry_[i] = NDArray(storage_type, vshape[i], data_context[i]); } +#if EXECUTOR_DEBUG + LOG(INFO) << "init data entry " << i << " as stype " << storage_type; +#endif } } @@ -871,7 +879,7 @@ void GraphExecutor::RunOps(bool is_train, size_t topo_start, size_t topo_end) { bool profiling = false; #endif #if EXECUTOR_DEBUG - LOG(INFO) << "Running node " << nid << " - " << seg_op.topo_end - 1; + LOG(INFO) << "Run node " << nid << " - " << seg_op.topo_end - 1; #endif Engine::Get()->Push(seg_op.opr, seg_op.ctx, 0, profiling); nid = seg_op.topo_end - 1; @@ -884,7 +892,7 @@ void GraphExecutor::RunOps(bool is_train, size_t topo_start, size_t topo_end) { if (op_nodes_[nid].skip_exec_node) continue; opnode.exec->op_ctx.is_train = is_train; #if EXECUTOR_DEBUG - LOG(INFO) << "Running node " << nid; + LOG(INFO) << "Run node " << nid; #endif if (opnode.exec->exec_type() == Operator::kCrossDeviceCopy) { CHECK_EQ(inode.inputs.size(), 1U); diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 94616855459c..e9f3195feed0 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -11,6 +11,7 @@ #include #include #include "./ndarray_function.h" +#include "../operator/tensor/init_op.h" #if MXNET_USE_OPENCV #include @@ -232,12 +233,12 @@ void ScalarOp(const NDArray &lhs, } } + void CopyFromTo(const NDArray &from, NDArray *to, int priority, bool alloc_output) { if (from.var() == to->var()) { // skip to copy to itself return; } - CHECK_EQ(from.storage_type(), to->storage_type()) << "Copying with different storage type"; CHECK(from.shape() == to->shape()) << "operands shape mismatch" << "from.shape = " << from.shape() << " to.shape=" << to->shape(); @@ -253,60 +254,28 @@ void CopyFromTo(const NDArray &from, NDArray *to, int priority, bool alloc_outpu if (a == cpu::kDevMask && b == cpu::kDevMask) { Engine::Get()->PushSync([from, ret, alloc](RunContext ctx) { - auto storage_type = from.storage_type(); - if (storage_type == kDefaultStorage) { - if (alloc) ret.CheckAndAlloc(); - TBlob tmp = ret.data(); - ndarray::Copy(from.data(), &tmp, - from.ctx(), ret.ctx(), ctx); - } else if (storage_type == kRowSparseStorage) { - auto aux_shape = from.aux_shape(0); - if (aux_shape.Size() == 0) return; - if (alloc) ret.CheckAndAlloc({aux_shape}); - TBlob val = ret.data(); - TBlob idx = ret.aux_data(rowsparse::kIdx); - ndarray::Copy(from.data(), &val, - from.ctx(), ret.ctx(), ctx); - ndarray::Copy(from.aux_data(rowsparse::kIdx), &idx, - from.ctx(), ret.ctx(), ctx); - } else { - LOG(FATAL) << "Not implemented yet"; - } + NDArray nd(ret); + CopyFromToImpl(from, &nd, ctx, alloc); }, from.ctx(), const_vars, {ret.var()}, FnProperty::kNormal, priority, PROFILER_MESSAGE("CopyCPU2CPU")); } else { #if MXNET_USE_CUDA if (a == cpu::kDevMask && b == gpu::kDevMask) { Engine::Get()->PushSync([from, ret, alloc](RunContext ctx) { - if (from.storage_type() != kDefaultStorage) LOG(FATAL) << "GPU not implemented yet"; - if (alloc) 
ret.CheckAndAlloc(); - TBlob tmp = ret.data(); - ndarray::Copy(from.data(), &tmp, - from.ctx(), ret.ctx(), ctx); - // Wait GPU kernel to complete - ctx.get_stream()->Wait(); + NDArray nd(ret); + CopyFromToImpl(from, &nd, ctx, alloc); }, ret.ctx(), const_vars, {ret.var()}, FnProperty::kCopyToGPU, priority, PROFILER_MESSAGE("CopyCPU2GPU")); } else if (a == gpu::kDevMask && b == cpu::kDevMask) { Engine::Get()->PushSync([from, ret, alloc](RunContext ctx) { - if (from.storage_type() != kDefaultStorage) LOG(FATAL) << "GPU not implemented yet"; - if (alloc) ret.CheckAndAlloc(); - TBlob tmp = ret.data(); - ndarray::Copy(from.data(), &tmp, - from.ctx(), ret.ctx(), ctx); - // Wait GPU kernel to complete - ctx.get_stream()->Wait(); + NDArray nd(ret); + CopyFromToImpl(from, &nd, ctx, alloc); }, from.ctx(), const_vars, {ret.var()}, FnProperty::kCopyFromGPU, priority, PROFILER_MESSAGE("CopyGPU2CPU")); } else if (a == gpu::kDevMask && b == gpu::kDevMask) { Engine::Get()->PushSync([from, ret, alloc](RunContext ctx) { - if (from.storage_type() != kDefaultStorage) LOG(FATAL) << "GPU not implemented yet"; - if (alloc) ret.CheckAndAlloc(); - TBlob tmp = ret.data(); - ndarray::Copy(from.data(), &tmp, - from.ctx(), ret.ctx(), ctx); - // Wait GPU kernel to complete - ctx.get_stream()->Wait(); + NDArray nd(ret); + CopyFromToImpl(from, &nd, ctx, alloc); }, from.ctx(), const_vars, {ret.var()}, from.dtype() != ret.dtype() ? FnProperty::kNormal : FnProperty::kCopyFromGPU, priority, PROFILER_MESSAGE("CopyGPU2GPU")); diff --git a/src/operator/elemwise_op_common.h b/src/operator/elemwise_op_common.h index c8a7b0c034e5..f4315b62a6a8 100644 --- a/src/operator/elemwise_op_common.h +++ b/src/operator/elemwise_op_common.h @@ -61,12 +61,10 @@ template *in_attrs, std::vector *out_attrs) { - // LOG(INFO) << "ElemwiseStorageAttr for " << attrs.name; auto deduce = [&](std::vector *vec, const char *name, AttrType& result, bool fallback) { auto &v = *vec; for (size_t i = 0; i < vec->size(); ++i) { - // LOG(INFO) << "deduce " << (*vec)[i]; if (v[i] == kUndefinedStorage) { // if input type is unknown, assume it's default storage CHECK(assign(&v[i], kDefaultStorage)); @@ -123,12 +121,16 @@ inline bool ElemwiseStorageType(const nnvm::NodeAttrs& attrs, } inline bool IdentityAttrLikeRhsStorageType(const nnvm::NodeAttrs& attrs, - std::vector *in_attrs, - std::vector *out_attrs) { + std::vector *in_attrs, + std::vector *out_attrs) { CHECK_EQ(in_attrs->size(), static_cast(2)) << " in operator " << attrs.name; CHECK_EQ(out_attrs->size(), static_cast(1)) << " in operator " << attrs.name; - return ElemwiseStorageAttr( - attrs, in_attrs, out_attrs); + auto &in = *in_attrs; + auto &out = *out_attrs; + CHECK_NE(in[1], kUndefinedStorage) << "rhs storage type must be known"; + if (in[0] == kUndefinedStorage) in[0] = in[1]; + if (out[0] == kUndefinedStorage) out[0] = in[1]; + return true; } // Transfer gradient and input to FGradient function @@ -163,6 +165,22 @@ struct ElemwiseGradUseNone { } }; +// TODO(haibin) this is a temporary function for debugging purpose. Remove later. 
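// ---------------------------------------------------------------------------
// A small self-contained sketch of the storage-type rule that the rewritten
// IdentityAttrLikeRhsStorageType above encodes: the rhs storage type must
// already be known, and both the lhs input and the output fall back to it
// when they are still undefined. The enum below only mirrors the intent; it
// is not the MXNet NDArrayStorageType definition.
#include <cassert>
#include <vector>

enum Storage { kUndefined = -1, kDefault = 0, kRowSparse = 1, kCSR = 2 };

bool IdentityLikeRhsStorage(std::vector<Storage>* in, std::vector<Storage>* out) {
  assert(in->size() == 2 && out->size() == 1);
  assert((*in)[1] != kUndefined);             // rhs storage type must be known
  if ((*in)[0] == kUndefined)  (*in)[0]  = (*in)[1];
  if ((*out)[0] == kUndefined) (*out)[0] = (*in)[1];
  return true;
}

// e.g. in = {kUndefined, kRowSparse}, out = {kUndefined}
//   -> in[0] and out[0] both become kRowSparse.
// ---------------------------------------------------------------------------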
+template +void print_info(const mshadow::Tensor& tensor, const std::string& name) { + std::cout << "Tensor " << name << " with shape ("; + int len = 1; + for (int i = 0; i < dim; i++) { + len *= tensor.shape_[i]; + std::cout << tensor.shape_[i] << ","; + if (i == dim - 1) std::cout << ")"; + } + std::cout << std::endl; + for (int j = 0; j < len; j ++) std::cout << tensor.dptr_[j] << " "; + std::cout << std::endl; +} + + } // namespace op } // namespace mxnet diff --git a/src/operator/operator_common.h b/src/operator/operator_common.h index 205a6e337a2a..78c03685f36f 100755 --- a/src/operator/operator_common.h +++ b/src/operator/operator_common.h @@ -313,17 +313,17 @@ inline void ParamParser(nnvm::NodeAttrs* attrs) { } template -void FComputeExFallback(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs, - FCompute fcompute) { - std::vector input_blobs, output_blobs; - std::vector tmp_nds; - common::PrepDefaultBlobs(inputs, outputs, &input_blobs, - &output_blobs, &tmp_nds, false, ctx); - fcompute(attrs, ctx, input_blobs, req, output_blobs); +void FCompExFallback(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs, + FCompute fcompute) { + std::vector in_blobs, out_blobs; + std::vector tmps; + common::GetInputBlobs(inputs, &in_blobs, &tmps, ctx); + common::GetOutputBlobs(outputs, &out_blobs, false); + fcompute(attrs, ctx, in_blobs, req, out_blobs); } diff --git a/src/operator/optimizer_op-inl.h b/src/operator/optimizer_op-inl.h index a4115f5960eb..04d7d03a52ba 100755 --- a/src/operator/optimizer_op-inl.h +++ b/src/operator/optimizer_op-inl.h @@ -111,8 +111,9 @@ struct SparseSGDDnsRspKernel { // IType is row sparse idx type // i is the ith row in row sparse gradient template - MSHADOW_XINLINE static void Map(int i, size_t width, DType* out, const DType* weight, const IType* grad_idx, - const DType *grad_val, const DType clip_gradient, const DType lr, + MSHADOW_XINLINE static void Map(int i, size_t width, DType* out, const DType* weight, + const IType* grad_idx, const DType *grad_val, + const DType clip_gradient, const DType lr, const DType wd, const DType rescale_grad) { for (size_t j = 0; j < width; j++) { uint64_t data_i = grad_idx[i] * width + j; @@ -159,7 +160,6 @@ inline void SparseSGDUpdateDnsRspImpl(const SGDParam& param, static_cast(param.clip_gradient), static_cast(param.lr), static_cast(param.wd), static_cast(param.rescale_grad)); - }); }); }); diff --git a/src/operator/optimizer_op.cc b/src/operator/optimizer_op.cc index 27c39db864f5..9a776a94bf0c 100644 --- a/src/operator/optimizer_op.cc +++ b/src/operator/optimizer_op.cc @@ -46,7 +46,7 @@ It updates the weights using:: .set_attr_parser(ParamParser) .set_attr("FInferShape", ElemwiseShape<2, 1>) .set_attr("FInferType", ElemwiseType<2, 1>) -// TODO write FCompute for sparse sgd +// TODO(haibin) write FCompute for sparse sgd // .set_attr("FCompute", SGDUpdate) .set_attr(FCOMP_EX_CPU, SparseSGDUpdateEx) .add_argument("weight", "NDArray-or-Symbol", "Weight") diff --git a/src/operator/optimizer_op.cu b/src/operator/optimizer_op.cu index 2b2667ec317b..24c2d8d7fd02 100644 --- a/src/operator/optimizer_op.cu +++ b/src/operator/optimizer_op.cu @@ -15,6 +15,9 @@ NNVM_REGISTER_OP(sgd_update) NNVM_REGISTER_OP(sgd_mom_update) .set_attr("FCompute", SGDMomUpdate); +NNVM_REGISTER_OP(sparse_sgd_update) +.set_attr(FCOMP_EX_GPU, SparseSGDUpdateEx); + NNVM_REGISTER_OP(adam_update) 
.set_attr("FCompute", AdamUpdate); diff --git a/src/operator/tensor/elemwise_binary_op.h b/src/operator/tensor/elemwise_binary_op.h index 0b4ce1a6b381..362e6b4805e0 100644 --- a/src/operator/tensor/elemwise_binary_op.h +++ b/src/operator/tensor/elemwise_binary_op.h @@ -41,42 +41,35 @@ void BinaryComputeRspRsp(const nnvm::NodeAttrs& attrs, const std::vector& inputs, const std::vector& req, const std::vector& outputs) { - CHECK_EQ(inputs.size(), 2); - CHECK_EQ(outputs.size(), 1); - auto &nd_l = inputs[0]; - auto &nd_r = inputs[1]; + auto &lhs = inputs[0]; + auto &rhs = inputs[1]; auto &output = outputs[0]; - CHECK_EQ(nd_l.storage_type(), kRowSparseStorage) << "Sparse type not supported yet"; - // Memory Estimation - unsigned int num_rows_l = nd_l.aux_shape(rowsparse::kIdx).Size(); - unsigned int num_rows_r = nd_r.aux_shape(rowsparse::kIdx).Size(); - // TODO(haibin) both zero? - if (num_rows_l + num_rows_r == 0) { + bool init_l = lhs.storage_initialized(); + bool init_r = rhs.storage_initialized(); + // both inputs are zeros + if (!init_l && !init_r) return; + // one of the input is zeros + if (!init_l || !init_r) { + NDArray out(output); + CopyFromToRspImpl(!init_l ? rhs : lhs, &out, ctx.run_ctx, true); return; } - // This is (roughly) the number of result rows + // Memory Estimation: This is (roughly) the number of result rows. We still + // need to subtract the number of common rows + unsigned int num_rows_l = lhs.aux_shape(rowsparse::kIdx).Size(); + unsigned int num_rows_r = rhs.aux_shape(rowsparse::kIdx).Size(); output.CheckAndAlloc({TShape({num_rows_l + num_rows_r})}); mshadow::Stream *s = ctx.get_stream(); - if (num_rows_l == 0) { - NDArray out(output); - CopyFromTo(nd_r, &out); - return; - } - if (num_rows_r == 0) { - NDArray out(output); - CopyFromTo(nd_l, &out); - return; - } MSHADOW_TYPE_SWITCH(output.dtype(), DType, { - MSHADOW_TYPE_SWITCH(nd_l.aux_type(rowsparse::kIdx), IType, { + MSHADOW_TYPE_SWITCH(lhs.aux_type(rowsparse::kIdx), IType, { // Indices - auto indices_l = nd_l.aux_data(rowsparse::kIdx).FlatTo1D(s); - auto indices_r = nd_r.aux_data(rowsparse::kIdx).FlatTo1D(s); + auto indices_l = lhs.aux_data(rowsparse::kIdx).FlatTo1D(s); + auto indices_r = rhs.aux_data(rowsparse::kIdx).FlatTo1D(s); auto indices_out = output.aux_data(rowsparse::kIdx).FlatTo1D(s); // Data - auto data_l = nd_l.data().FlatTo2D(s); - auto data_r = nd_r.data().FlatTo2D(s); + auto data_l = lhs.data().FlatTo2D(s); + auto data_r = rhs.data().FlatTo2D(s); auto out = output.data().FlatTo2D(s); // TODO(haibin) A more appropriate way: Copy to output, then apply ops @@ -134,12 +127,12 @@ void BinaryComputeEx(const nnvm::NodeAttrs& attrs, if (typeid(OP) == typeid(mshadow::op::plus)) { // If any input is dense, fallback to FCompute if (common::ContainsDefaultStorage(inputs)) { - FComputeExFallback(attrs, ctx, inputs, req, outputs, BinaryCompute); + FCompExFallback(attrs, ctx, inputs, req, outputs, BinaryCompute); return; } CHECK_EQ(inputs[0].storage_type(), kRowSparseStorage) << "Sparse type not supported yet"; CHECK_EQ(inputs[1].storage_type(), kRowSparseStorage) << "Sparse type not supported yet"; - BinaryComputeRspRsp(attrs, ctx, inputs, req, outputs); + BinaryComputeRspRsp(attrs, ctx, inputs, req, outputs); return; } else { LOG(FATAL) << "Not implemented"; @@ -175,6 +168,8 @@ void BinaryBackwardUseNoneRsp(const nnvm::NodeAttrs& attrs, using namespace mshadow::expr; Stream *s = ctx.get_stream(); CHECK_EQ(inputs[0].storage_type(), kRowSparseStorage); + CHECK_EQ(outputs[0].storage_type(), kRowSparseStorage); + 
CHECK_EQ(outputs[1].storage_type(), kRowSparseStorage); CHECK(typeid(LOP) == typeid(mshadow_op::identity)); CHECK(typeid(ROP) == typeid(mshadow_op::identity)); TShape shape = inputs[0].aux_shape(rowsparse::kIdx); @@ -208,6 +203,7 @@ void BinaryBackwardUseNoneEx(const nnvm::NodeAttrs& attrs, auto stype = inputs[0].storage_type(); CHECK_EQ(stype, kRowSparseStorage) << "Not implemented yet"; BinaryBackwardUseNoneRsp(attrs, ctx, inputs, req, outputs); + // TODO(haibin) fallback for kDefaultStorage } template diff --git a/src/operator/tensor/elemwise_binary_op_basic.cu b/src/operator/tensor/elemwise_binary_op_basic.cu index e2a0eb75c579..4f88aecfdf9f 100644 --- a/src/operator/tensor/elemwise_binary_op_basic.cu +++ b/src/operator/tensor/elemwise_binary_op_basic.cu @@ -9,7 +9,8 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(elemwise_add) -.set_attr("FCompute", BinaryCompute); +.set_attr("FCompute", BinaryCompute) +.set_attr(FCOMP_EX_GPU, BinaryComputeEx); NNVM_REGISTER_OP(_grad_add) .set_attr("FCompute", BinaryCompute); @@ -17,7 +18,9 @@ NNVM_REGISTER_OP(_grad_add) NNVM_REGISTER_OP(_backward_add) .set_attr("FCompute", BinaryBackwardUseNone); + mshadow_op::identity, mshadow_op::identity>) +.set_attr(FCOMP_EX_GPU, + BinaryBackwardUseNoneEx); NNVM_REGISTER_OP(_sub) .set_attr("FCompute", BinaryCompute); diff --git a/src/operator/tensor/elemwise_unary_op.cc b/src/operator/tensor/elemwise_unary_op.cc index 879af02373a9..ae9673621570 100644 --- a/src/operator/tensor/elemwise_unary_op.cc +++ b/src/operator/tensor/elemwise_unary_op.cc @@ -83,7 +83,7 @@ NNVM_REGISTER_OP(_identity_with_attr_like_rhs) .set_attr("FIgnoreInputs", [](const NodeAttrs& attrs) { return std::vector(1, 1); }) .set_attr("FCompute", IdentityCompute) -.set_attr(FCOMP_EX_CPU, IdentityComputeEx) +.set_attr(FCOMP_EX_CPU, IdentityLikeRhsComputeEx) .set_attr("FInferShape", ElemwiseShape<2, 1>) .set_attr("FInferStorageType", IdentityAttrLikeRhsStorageType) .set_attr( diff --git a/src/operator/tensor/elemwise_unary_op.cu b/src/operator/tensor/elemwise_unary_op.cu index c5a72b4e8c4f..24b8a25ecbae 100644 --- a/src/operator/tensor/elemwise_unary_op.cu +++ b/src/operator/tensor/elemwise_unary_op.cu @@ -23,7 +23,9 @@ NNVM_REGISTER_OP(make_loss) // identity output as first input, but attributes are constrainted to be like rhs NNVM_REGISTER_OP(_identity_with_attr_like_rhs) -.set_attr("FCompute", IdentityCompute); +.set_attr("FCompute", IdentityCompute) +.set_attr(FCOMP_EX_GPU, IdentityLikeRhsComputeEx); + NNVM_REGISTER_OP(Cast) .set_attr("FCompute", CastCompute); diff --git a/src/operator/tensor/elemwise_unary_op.h b/src/operator/tensor/elemwise_unary_op.h index 48aac5471bc8..2380c9d41dfd 100644 --- a/src/operator/tensor/elemwise_unary_op.h +++ b/src/operator/tensor/elemwise_unary_op.h @@ -68,32 +68,27 @@ void IdentityComputeRsp(const nnvm::NodeAttrs& attrs, using namespace mshadow; using namespace mshadow::expr; Stream *s = ctx.get_stream(); - NDArrayStorageType storage_type = inputs[1].storage_type(); - CHECK_EQ(storage_type, kRowSparseStorage); - if (req[0] == kNullOp) { - LOG(FATAL) << "kNullOp in IdentityComputeEx not supported yet"; - } - if (req[0] == kWriteInplace) { - LOG(FATAL) << "kWriteInplace for sparse storage not supported yet"; - } - TShape shape = inputs[1].aux_shape(rowsparse::kIdx); - if (shape.ndim() == 0) return; - outputs[0].CheckAndAlloc({shape}); - MSHADOW_TYPE_SWITCH(outputs[0].dtype(), DType, { - MSHADOW_TYPE_SWITCH(outputs[0].aux_type(rowsparse::kIdx), AuxType, { - auto out_d = outputs[0].data().FlatTo1D(s); - auto 
out_aux = outputs[0].aux_data(rowsparse::kIdx).FlatTo1D(s); - auto in_aux = inputs[1].aux_data(rowsparse::kIdx).FlatTo1D(s); + auto &input = inputs[0]; + auto &output = outputs[0]; + CHECK_NE(req[0], kNullOp) << "kNullOp in IdentityComputeEx not supported yet"; + CHECK_NE(req[0], kWriteInplace) << "kWriteInplace in IdentityComputeEx not supported yet"; + if (!input.storage_initialized()) return; + TShape shape = input.aux_shape(rowsparse::kIdx); + output.CheckAndAlloc({shape}); + MSHADOW_TYPE_SWITCH(output.dtype(), DType, { + MSHADOW_TYPE_SWITCH(output.aux_type(rowsparse::kIdx), AuxType, { + auto out_d = output.data().FlatTo1D(s); + auto out_aux = output.aux_data(rowsparse::kIdx).FlatTo1D(s); + auto in_aux = input.aux_data(rowsparse::kIdx).FlatTo1D(s); ASSIGN_DISPATCH(out_d, req[0], - F(inputs[1].data().FlatTo1D(s))); + F(input.data().FlatTo1D(s))); ASSIGN_DISPATCH(out_aux, req[0], F(in_aux)); }); }); } -// FIXME the index is hard coded for _identity_with_attr_like_rhs op template -void IdentityComputeEx(const nnvm::NodeAttrs& attrs, +void IdentityLikeRhsComputeEx(const nnvm::NodeAttrs& attrs, const OpContext& ctx, const std::vector& inputs, const std::vector& req, @@ -103,9 +98,13 @@ void IdentityComputeEx(const nnvm::NodeAttrs& attrs, CHECK_EQ(inputs.size(), 2); CHECK_EQ(outputs.size(), 1); Stream *s = ctx.get_stream(); - NDArrayStorageType stype = inputs[1].storage_type(); - CHECK_EQ(stype, kRowSparseStorage) << "Not implemented yet"; - IdentityComputeRsp(attrs, ctx, inputs, req, outputs); + size_t rhs_idx = 1; + NDArrayStorageType stype = inputs[rhs_idx].storage_type(); + if (stype == kRowSparseStorage) { + IdentityComputeRsp(attrs, ctx, inputs, req, outputs); + } else { + LOG(FATAL) << "Not implemented yet"; + } } struct CastParam : public dmlc::Parameter { @@ -199,12 +198,11 @@ struct FillRspRowIdx { * Will revisit this interface in the future. */ template -void CastStorageDnsRspImpl(const OpContext& ctx, const TBlob& dns, NDArray* rsp) { +void CastStorageDnsRspImpl(mshadow::Stream *s, const TBlob& dns, NDArray* rsp) { CHECK(rsp != nullptr); CHECK_EQ(rsp->storage_type(), kRowSparseStorage); CHECK_EQ(dns.shape_, rsp->shape()); - mshadow::Stream *s = ctx.get_stream(); rsp->CheckAndAllocAuxData(rowsparse::kIdx, mshadow::Shape1(dns.shape_[0])); MSHADOW_TYPE_SWITCH(dns.type_flag_, DType, { // data type NDARRAY_IDX_TYPE_SWITCH(rsp->aux_type(rowsparse::kIdx), RType, { // row idx type @@ -220,7 +218,10 @@ void CastStorageDnsRspImpl(const OpContext& ctx, const TBlob& dns, NDArray* rsp) for (index_t i = 0; i < num_rows; ++i) { if (row_idx[i] < static_cast(num_rows)) ++nnr; } - if (0 == nnr) return; // zero matrix + if (0 == nnr) { + rsp->SetAuxShape(rowsparse::kIdx, TShape({0})); + return; // zero matrix + } rsp->CheckAndAllocData(mshadow::Shape2(nnr, num_cols)); // TODO(junwu): single thread for compressing row_idx and copying data // from dns to rsp, might be a bottleneck. 
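// ---------------------------------------------------------------------------
// Standalone sketch of the dense -> row-sparse cast that CastStorageDnsRspImpl
// above performs: find the rows containing at least one non-zero, then copy
// only those rows into the compact row-sparse index/data buffers. Plain STL
// types are used for clarity; they are not the MXNet NDArray interfaces.
#include <cstddef>
#include <vector>

struct RspOut {
  std::vector<int> row_idx;    // indices of stored (non-zero) rows
  std::vector<float> data;     // compacted rows, row_idx.size() x cols
};

RspOut CastDnsToRsp(const std::vector<float>& dns, size_t rows, size_t cols) {
  RspOut rsp;
  for (size_t r = 0; r < rows; ++r) {
    bool nonzero = false;
    for (size_t c = 0; c < cols; ++c) {
      if (dns[r * cols + c] != 0.0f) { nonzero = true; break; }
    }
    if (!nonzero) continue;                  // skip all-zero rows
    rsp.row_idx.push_back(static_cast<int>(r));
    for (size_t c = 0; c < cols; ++c) rsp.data.push_back(dns[r * cols + c]);
  }
  // An empty row_idx corresponds to the zero-matrix case handled above by
  // setting the aux shape to {0}.
  return rsp;
}
// ---------------------------------------------------------------------------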
@@ -239,30 +240,47 @@ void CastStorageDnsRspImpl(const OpContext& ctx, const TBlob& dns, NDArray* rsp) }); } +// TODO(haibin) Use memcopy instead will be much faster than assigning each individual element +struct CastStorageRspDnsKernel { + template + MSHADOW_XINLINE static void Map(int i, const index_t width, const IType* idx, const DType *data, + DType* dns, const index_t invalid_rid) { + auto rid = idx[i]; + // skip invalid rows + if (rid == invalid_rid) return; + auto dns_offset = rid * width; + auto rsp_offset = i * width; + for (size_t col = 0; col < width; col++) { + dns[dns_offset + col] = data[rsp_offset + col]; + } + } +}; + + /*! * \brief This function assumes that the meomry for dns has been allocated already * since the shape is known at binding stage. */ template -void CastStorageRspDnsImpl(const OpContext& ctx, const NDArray& rsp, TBlob* dns) { +void CastStorageRspDnsImpl(mshadow::Stream *s, const NDArray& rsp, TBlob* dns) { using namespace mshadow; using namespace mshadow::expr; - Stream *s = ctx.get_stream(); CHECK_EQ(rsp.storage_type(), kRowSparseStorage); MSHADOW_TYPE_SWITCH(dns->type_flag_, DType, { - MSHADOW_TYPE_SWITCH(rsp.aux_type(rowsparse::kIdx), IType, { + NDARRAY_IDX_TYPE_SWITCH(rsp.aux_type(rowsparse::kIdx), IType, { // assign zeros mxnet_op::Kernel::Launch(s, dns->Size(), dns->dptr()); - // data() is not empty - if (rsp.storage_shape().ndim() != 0) { - // Copy over - auto in_data = rsp.data().FlatTo2D(s); - auto out_data = dns->FlatTo2D(s); - auto num_rows = rsp.aux_shape(rowsparse::kIdx)[0]; - auto in_idx = rsp.aux_data(rowsparse::kIdx).FlatTo1D(s); - for (size_t i = 0; i < num_rows; i += 1) { - mshadow::Copy(out_data[in_idx[i]], in_data[i], s); - } + if (rsp.storage_initialized()) { + // copy over row by row + auto in_idx = rsp.aux_data(rowsparse::kIdx).FlatTo1D(s).dptr_; + auto in_data = rsp.data().FlatTo2D(s).dptr_; + auto out_data = dns->FlatTo2D(s).dptr_; + auto num_rows = rsp.aux_shape(rowsparse::kIdx).Size(); + auto rsp_shape = rsp.shape(); + auto invalid_rid = rsp_shape[0]; + auto width = rsp_shape.ProdShape(1, rsp_shape.ndim()); + mxnet_op::Kernel::Launch(s, num_rows, width, in_idx, in_data, + out_data, invalid_rid); } }); }); @@ -337,13 +355,12 @@ struct FillCsrColIdxAndVals { * Will revisit this interface in the future. */ template -void CastStorageDnsCsrImpl(const OpContext& ctx, const TBlob& dns, NDArray* csr) { +void CastStorageDnsCsrImpl(mshadow::Stream *s, const TBlob& dns, NDArray* csr) { CHECK(csr != nullptr); CHECK_EQ(csr->storage_type(), kCSRStorage); CHECK_EQ(dns.shape_.ndim(), 2); CHECK_EQ(dns.shape_, csr->shape()); - mshadow::Stream *s = ctx.get_stream(); MSHADOW_TYPE_SWITCH(dns.type_flag_, DType, { // data type NDARRAY_IDX_TYPE_SWITCH(csr->aux_type(csr::kIndPtr), IType, { // indptr type NDARRAY_IDX_TYPE_SWITCH(csr->aux_type(csr::kIdx), CType, { // col idx type @@ -408,13 +425,12 @@ struct CopyCsrDataToDns { * Will revisit this interface in the future. 
*/ template -void CastStorageCsrDnsImpl(const OpContext& ctx, const NDArray& csr, TBlob* dns) { +void CastStorageCsrDnsImpl(mshadow::Stream *s, const NDArray& csr, TBlob* dns) { CHECK(dns != nullptr); CHECK_EQ(csr.storage_type(), kCSRStorage); CHECK_EQ(dns->shape_.ndim(), 2); CHECK_EQ(dns->shape_, csr.shape()); - mshadow::Stream *s = ctx.get_stream(); MSHADOW_TYPE_SWITCH(dns->type_flag_, DType, { // data type NDARRAY_IDX_TYPE_SWITCH(csr.aux_type(csr::kIndPtr), IType, { // indptr type NDARRAY_IDX_TYPE_SWITCH(csr.aux_type(csr::kIdx), CType, { // col idx type @@ -422,6 +438,7 @@ void CastStorageCsrDnsImpl(const OpContext& ctx, const NDArray& csr, TBlob* dns) const index_t num_cols = dns->shape_[1]; DType* dns_data = dns->dptr(); mxnet_op::Kernel::Launch(s, dns->shape_.Size(), dns_data); + if (!csr.storage_initialized()) return; const IType* indptr = csr.aux_data(csr::kIndPtr).dptr(); const CType* col_idx = csr.aux_data(csr::kIdx).dptr(); const DType* csr_data = csr.data().dptr(); @@ -446,6 +463,30 @@ inline bool CastStorageInferStorageType(const nnvm::NodeAttrs& attrs, return true; } +template +void CastStorageComputeImpl(mshadow::Stream *s, + const NDArray& input, + const NDArray& output) { + using namespace mshadow; + using namespace mshadow::expr; + const auto src_stype = input.storage_type(); + const auto dst_stype = output.storage_type(); + if (src_stype == kRowSparseStorage && dst_stype == kDefaultStorage) { + TBlob ret = output.data(); + CastStorageRspDnsImpl(s, input, &ret); + } else if (src_stype == kDefaultStorage && dst_stype == kRowSparseStorage) { + NDArray ret = output; // get rid of the const qualifer + CastStorageDnsRspImpl(s, input.data(), &ret); + } else if (src_stype == kDefaultStorage && dst_stype == kCSRStorage) { + NDArray ret = output; // get rid of the const qualifer + CastStorageDnsCsrImpl(s, input.data(), &ret); + } else if (src_stype == kCSRStorage && dst_stype == kDefaultStorage) { + TBlob ret = output.data(); + CastStorageCsrDnsImpl(s, input, &ret); + } else { + LOG(FATAL) << "Not implemented"; + } +} template void CastStorageComputeEx(const nnvm::NodeAttrs& attrs, const OpContext& ctx, @@ -457,23 +498,7 @@ void CastStorageComputeEx(const nnvm::NodeAttrs& attrs, Stream *s = ctx.get_stream(); CHECK_EQ(inputs.size(), 1); CHECK_EQ(outputs.size(), 1); - const auto src_stype = inputs[0].storage_type(); - const auto dst_stype = outputs[0].storage_type(); - if (src_stype == kRowSparseStorage && dst_stype == kDefaultStorage) { - TBlob ret = outputs[0].data(); - CastStorageRspDnsImpl(ctx, inputs[0], &ret); - } else if (src_stype == kDefaultStorage && dst_stype == kRowSparseStorage) { - NDArray ret = outputs[0]; // get rid of the const qualifer - CastStorageDnsRspImpl(ctx, inputs[0].data(), &ret); - } else if (src_stype == kDefaultStorage && dst_stype == kCSRStorage) { - NDArray ret = outputs[0]; // get rid of the const qualifer - CastStorageDnsCsrImpl(ctx, inputs[0].data(), &ret); - } else if (src_stype == kCSRStorage && dst_stype == kDefaultStorage) { - TBlob ret = outputs[0].data(); - CastStorageCsrDnsImpl(ctx, inputs[0], &ret); - } else { - LOG(FATAL) << "Not implemented"; - } + CastStorageComputeImpl(s, inputs[0], outputs[0]); } #define MXNET_OPERATOR_REGISTER_UNARY(name) \ diff --git a/src/operator/tensor/indexing_op.cc b/src/operator/tensor/indexing_op.cc index f9023054a10f..fed4b4dd229b 100644 --- a/src/operator/tensor/indexing_op.cc +++ b/src/operator/tensor/indexing_op.cc @@ -86,6 +86,40 @@ NNVM_REGISTER_OP(_backward_Embedding) .set_attr("TIsBackward", true) 
.set_attr("FCompute", EmbeddingOpBackward); +NNVM_REGISTER_OP(SparseEmbedding) +.describe(R"code(Maps integer indices to vector representations (embeddings) with sparse weight update +)code" ADD_FILELINE) +.set_num_inputs(2) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"data", "weight"}; + }) +.set_attr("FInferShape", EmbeddingOpShape) +.set_attr("FInferType", EmbeddingOpType) +.set_attr("FResourceRequest", + [](const NodeAttrs& attrs) { + return std::vector{ResourceRequest::kTempSpace}; + }) +.set_attr("FCompute", EmbeddingOpForward) +.set_attr("FGradient", + [](const nnvm::NodePtr& n, const std::vector& ograds) { + return MakeNonlossGradNode("_backward_SparseEmbedding", n, ograds, + {n->inputs[0]}, n->attrs.dict); + }) +.add_argument("data", "NDArray-or-Symbol", "The input array to the embedding operator.") +.add_argument("weight", "NDArray-or-Symbol", "The embedding weight matrix.") +.add_arguments(EmbeddingParam::__FIELDS__()); + +NNVM_REGISTER_OP(_backward_SparseEmbedding) +.set_num_inputs(2) +.set_num_outputs(2) +.set_attr("TIsBackward", true) +.set_attr("FInferStorageType", SparseEmbeddingBackwardStorageType) +.set_attr("FComputeEx", SparseEmbeddingOpBackwardEx); +// TODO(haibin) handle dense case +// .set_attr("FCompute", EmbeddingOpBackward); NNVM_REGISTER_OP(take) .describe(R"code(Takes elements from an input array along the given axis. diff --git a/src/operator/tensor/indexing_op.h b/src/operator/tensor/indexing_op.h index 041ceab8a679..944adb4ace8d 100755 --- a/src/operator/tensor/indexing_op.h +++ b/src/operator/tensor/indexing_op.h @@ -315,6 +315,133 @@ void EmbeddingOpBackward(const nnvm::NodeAttrs& attrs, }); } +// todo template req +struct SparseEmbeddingBackwardRsp { + template + // size_t? + // i for each thread + // each thread is responsible for rows in output in [segment_start, segment_end) + MSHADOW_XINLINE static void Map(int i, const size_t width, IType* dst_idx, DType* dst_val, + const IType* idx, const size_t num_idx, const DType* src, + const size_t segment_len, const size_t num_rows, OpReqType req) { + size_t segment_start = i * segment_len; + size_t segment_end = (i + 1) * segment_len; + for (size_t y = 0; y < num_idx; y++) { + size_t j = idx[y]; + if (j > num_rows) j = num_rows - 1; + if (j < segment_start || j >= segment_end) continue; + dst_idx[j] = j; + for (size_t k = 0; k < width; k++) { + if (req == kWriteTo) { req = kAddTo;} + KERNEL_ASSIGN(dst_val[j * width + k], req, src[y * width + k]); + } + } + } +}; + +inline bool SparseEmbeddingBackwardStorageType(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ((*in_attrs)[0], kDefaultStorage); + CHECK_EQ((*in_attrs)[1], kDefaultStorage); + (*out_attrs)[0] = kRowSparseStorage; + return true; +} + +// todo replace xpu with cpu +template +void SparseEmbeddingOpBackwardDnsDnsRsp(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mshadow; + using namespace mxnet_op; + using namespace mshadow::expr; + CHECK_EQ(inputs.size(), 2U); + CHECK_EQ(outputs.size(), 2U); + // CHECK_EQ(req[embedding::kData], kNullOp) + // << "Embedding layer doesn't support calculate data gradient" << req[0] << " " << req[1]; + // idx shape (d1, d2 .. dk) + auto idx = inputs[1]; + // grad shape (d1, d2, .. 
dk, out_dim) + auto grad = inputs[0]; + // weight shape (in_dim, out_dim) + auto output = outputs[1]; + CHECK_EQ(idx.storage_type(), kDefaultStorage); + CHECK_EQ(grad.storage_type(), kDefaultStorage); + CHECK_EQ(output.dtype(), grad.dtype()); + CHECK_EQ(idx.dtype(), output.aux_type(rowsparse::kIdx)) << "Index type doesn't match"; + + const TShape& ishape = idx.shape(); + const TShape& oshape = grad.shape(); + + Stream *s = ctx.get_stream(); + CHECK_EQ(idx.dtype(), output.aux_type(rowsparse::kIdx)) + << "embedding input index and gradient row sparse type doesn't match!"; + // Alloc dense + unsigned int num_rows = output.shape()[0]; + output.CheckAndAlloc({TShape({num_rows})}); + MSHADOW_TYPE_SWITCH(output.dtype(), DType, { + MSHADOW_TYPE_SWITCH(idx.dtype(), IType, { + // Assuming aux_type == IType for now + // idx_data shape (d1 * d2 * .. dk) + // input embedding indice (d1 * d2 * .. dk), each idx in [0, input_dim) + auto idx_data = idx.data().FlatTo1D(s); + // grad_data shape (d1 * d2 * .. dk, out_dim) + auto grad_data = grad.data().get_with_shape( + Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1]), s); + // output shape (in_dim, out_dim) + auto output_idx = output.aux_data(rowsparse::kIdx).FlatTo1D(s); + auto output_val = output.data().FlatTo2D(s); + int num_threads = 64; + // dim_1 .. dim_k + size_t width = output.shape()[1]; + size_t segment_len = (num_rows + num_threads - 1) / num_threads; + // TODO(refactor me) fill with invalid values + for (size_t i = 0; i < num_rows; i++) { + output_idx.dptr_[i] = num_rows; + } + // fill 0 + for (size_t i = 0; i < output_val.shape_.Size(); i++) { + output_val.dptr_[i] = 0; + } + Kernel::Launch(s, num_threads, width, output_idx.dptr_, + output_val.dptr_, idx_data.dptr_, + ishape.Size(), grad_data.dptr_, segment_len, + num_rows, req[1]); + }); + }); +} + +// todo replace xpu with cpu +template +void SparseEmbeddingOpBackwardEx(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mshadow; + using namespace mxnet_op; + using namespace mshadow::expr; + CHECK_EQ(inputs.size(), 2U); + CHECK_EQ(outputs.size(), 2U); + // CHECK_EQ(req[embedding::kData], kNullOp) + // << "Embedding layer doesn't support calculate data gradient" << req[0] << " " << req[1]; + // idx shape (d1, d2 .. dk) + auto idx_stype = inputs[1].storage_type(); + // grad shape (d1, d2, .. 
dk, out_dim) + auto grad_stype = inputs[0].storage_type(); + // weight shape (in_dim, out_dim) + auto output_stype = outputs[1].storage_type(); + if (idx_stype == kDefaultStorage && grad_stype == kDefaultStorage && + output_stype == kRowSparseStorage) { + SparseEmbeddingOpBackwardDnsDnsRsp(attrs, ctx, inputs, req, outputs); + } else { + LOG(FATAL) << "Not implemented"; + } +} + namespace take_ { // to avoid name conflict enum TakeOpInputs {kArr, kIdx}; enum TakeOpOutputs {kOut}; diff --git a/src/operator/tensor/init_op.cc b/src/operator/tensor/init_op.cc index b091cbca2d9f..ddee0c7a46f5 100644 --- a/src/operator/tensor/init_op.cc +++ b/src/operator/tensor/init_op.cc @@ -21,7 +21,7 @@ NNVM_REGISTER_OP(_zeros) .set_attr("FInferShape", InitShape) .set_attr("FInferType", InitType) .set_attr("FCompute", FillCompute) -.set_attr(FCOMP_EX_CPU, FillComputeEx) +.set_attr(FCOMP_EX_CPU, FillComputeZerosEx) .add_arguments(InitOpParam::__FIELDS__()); NNVM_REGISTER_OP(_ones) diff --git a/src/operator/tensor/init_op.cu b/src/operator/tensor/init_op.cu index a798f26db60d..bcb10f70b3c3 100644 --- a/src/operator/tensor/init_op.cu +++ b/src/operator/tensor/init_op.cu @@ -9,7 +9,8 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_zeros) -.set_attr("FCompute", FillCompute); +.set_attr("FCompute", FillCompute) +.set_attr(FCOMP_EX_GPU, FillComputeZerosEx); NNVM_REGISTER_OP(_ones) .set_attr("FCompute", FillCompute); diff --git a/src/operator/tensor/init_op.h b/src/operator/tensor/init_op.h index 1c96c1f2cf5f..2339282a6740 100644 --- a/src/operator/tensor/init_op.h +++ b/src/operator/tensor/init_op.h @@ -15,6 +15,8 @@ #include #include #include "../elemwise_op_common.h" +#include "../mxnet_op.h" + namespace mxnet { namespace op { @@ -126,8 +128,31 @@ void FillCompute(const nnvm::NodeAttrs& attrs, }); } -template -void FillComputeEx(const nnvm::NodeAttrs& attrs, +// Fill a rsp NDArray with zeros by updating the aux shape. +template +void FillZerosRspImpl(mshadow::Stream *s, NDArray *dst) { + if (!dst->storage_initialized()) return; + // reset the shapes if it's not zeros + auto storage_shape = dst->storage_shape(); + storage_shape[0] = 0; + dst->SetAuxShape(rowsparse::kIdx, TShape({0})); + dst->SetStorageShape(storage_shape); +} + +// Fill a CSR NDArray with zeros by updating the aux shape. 
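// ---------------------------------------------------------------------------
// Standalone sketch of the sparse embedding backward pass implemented by
// SparseEmbeddingOpBackwardDnsDnsRsp above: each output-gradient row is
// accumulated into the weight-gradient row selected by the corresponding
// input index, and only the touched rows are kept, giving a row-sparse
// gradient. A std::map stands in for the row-sparse index/value buffers; the
// real kernel partitions rows across threads instead.
#include <cstddef>
#include <map>
#include <vector>

// idx: flattened input indices, one per gradient row
// grad: row-major output gradient, idx.size() x width
std::map<int, std::vector<float>> EmbeddingBackwardRsp(
    const std::vector<int>& idx, const std::vector<float>& grad, size_t width) {
  std::map<int, std::vector<float>> grad_weight;   // row index -> accumulated row
  for (size_t y = 0; y < idx.size(); ++y) {
    std::vector<float>& row = grad_weight[idx[y]];
    if (row.empty()) row.assign(width, 0.0f);
    for (size_t k = 0; k < width; ++k) {
      row[k] += grad[y * width + k];               // repeated indices accumulate
    }
  }
  return grad_weight;
}
// ---------------------------------------------------------------------------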
+template +void FillZerosCsrImpl(mshadow::Stream *s, NDArray *dst) { + if (!dst->storage_initialized()) return; + // reset the shapes if it's not zeros + TShape new_shape({0}); + dst->SetAuxShape(csr::kIndPtr, new_shape); + dst->SetAuxShape(csr::kIdx, new_shape); + dst->SetStorageShape(new_shape); +} + +// This operator never needs to fall back, since there's no input NDArray +template +void FillComputeZerosEx(const nnvm::NodeAttrs& attrs, const OpContext& ctx, const std::vector& inputs, const std::vector& req, @@ -138,7 +163,15 @@ void FillComputeEx(const nnvm::NodeAttrs& attrs, CHECK_EQ(outputs.size(), 1); CHECK_EQ(inputs.size(), 0); auto stype = outputs[0].storage_type(); - CHECK_EQ(value, 0) << "Not implemented yet"; + if (stype == kRowSparseStorage) { + NDArray nd(outputs[0]); + FillZerosRspImpl(s, &nd); + } else if (stype == kCSRStorage) { + NDArray nd(outputs[0]); + FillZerosCsrImpl(s, &nd); + } else { + LOG(FATAL) << "storage type not implemented."; + } } template diff --git a/src/operator/tensor/matrix_op-inl.h b/src/operator/tensor/matrix_op-inl.h index 0b53f11316b0..85adf85f0904 100644 --- a/src/operator/tensor/matrix_op-inl.h +++ b/src/operator/tensor/matrix_op-inl.h @@ -619,11 +619,13 @@ void DotCsrDnsDnsImpl(const OpContext& ctx, s, data_out.Size(), data_out.dptr()); } if (trans_lhs) { + if (!lhs.storage_initialized()) return; mxnet_op::Kernel, xpu>::Launch(s, data_out.Size(), data_out.dptr(), data_l.dptr(), indptr_l.dptr(), col_idx_l.dptr(), data_r.dptr(), lhs.shape()[0], rhs.shape()[1]); } else { + if (!lhs.storage_initialized()) return; mxnet_op::Kernel, xpu>::Launch(s, data_out.Size(), data_out.dptr(), data_l.dptr(), indptr_l.dptr(), col_idx_l.dptr(), data_r.dptr(), rhs.shape()[1]); @@ -767,14 +769,15 @@ void DotCsrDnsRspImpl(const OpContext& ctx, mxnet_op::Kernel::Launch( s, out_tmp.shape_.Size(), out_tmp.dptr_); } - // generate a temporary dns output - mxnet_op::Kernel, xpu>::Launch( - s, out_tmp.shape_.Size(), out_tmp.dptr_, data_l.dptr(), - indptr_l.dptr(), col_idx_l.dptr(), data_r.dptr(), - lhs.shape()[0], out_tmp.shape_[1]); - + if (lhs.storage_initialized()) { + // generate a temporary dns output + mxnet_op::Kernel, xpu>::Launch( + s, out_tmp.shape_.Size(), out_tmp.dptr_, data_l.dptr(), + indptr_l.dptr(), col_idx_l.dptr(), data_r.dptr(), + lhs.shape()[0], out_tmp.shape_[1]); + } // cast dns to rsp - CastStorageDnsRspImpl(ctx, TBlob(out_tmp), ret); + CastStorageDnsRspImpl(s, TBlob(out_tmp), ret); } else { // TODO(junwu): check whether the following code is a bottleneck // allocate output NDArray (single thread) @@ -859,6 +862,17 @@ void DotBackwardCsrDnsRsp(const nnvm::NodeAttrs& attrs, DotCsrDnsRspImpl(ctx, inputs[1], inputs[0], req[1], !param.transpose_a, &ret); } +template +void DotBackwardCsrDnsDns(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const DotParam& param = nnvm::get(attrs.parsed); + NDArray ret = outputs[1]; + DotCsrDnsDnsImpl(ctx, inputs[1], inputs[0], req[1], !param.transpose_a, &ret); +} + template void DotBackwardEx(const nnvm::NodeAttrs& attrs, const OpContext& ctx, @@ -875,12 +889,18 @@ void DotBackwardEx(const nnvm::NodeAttrs& attrs, // TODO(junwu): check whether this CHECK is reasonable const DotParam& param = nnvm::get(attrs.parsed); CHECK(!param.transpose_b) << "sparse dot only supports dot(A, X) and dot(A.T(), X)"; - if (inputs[0].storage_type() == kDefaultStorage // ograd dns format + // dns, csr, dns => *, rsp && inputs[1].storage_type() == 
kCSRStorage // csr input lhs of the op && inputs[2].storage_type() == kDefaultStorage // dns input rhs of the op && outputs[1].storage_type() == kRowSparseStorage) { // grad(rhs) rsp format DotBackwardCsrDnsRsp(attrs, ctx, inputs, req, outputs); + } else if (inputs[0].storage_type() == kDefaultStorage // ograd dns format + // dns, csr, dns => *, dns + && inputs[1].storage_type() == kCSRStorage // csr input lhs of the op + && inputs[2].storage_type() == kDefaultStorage // dns input rhs of the op + && outputs[1].storage_type() == kDefaultStorage) { // grad(rhs) dns format + DotBackwardCsrDnsDns(attrs, ctx, inputs, req, outputs); } else { // TODO(junwu): add fallback mechanism LOG(FATAL) << "Not supported"; diff --git a/src/operator/tensor/matrix_op.cc b/src/operator/tensor/matrix_op.cc index 4d8e30b46dc7..b3829f80c559 100644 --- a/src/operator/tensor/matrix_op.cc +++ b/src/operator/tensor/matrix_op.cc @@ -382,6 +382,10 @@ NNVM_REGISTER_OP(_backward_dot) .set_attr_parser(ParamParser) .set_attr("TIsBackward", true) .set_attr("FInferStorageType", DotBackwardInferStorageType) +.set_attr("FResourceRequest", + [](const NodeAttrs& attrs) { + return std::vector{ResourceRequest::kTempSpace}; + }) .set_attr("FCompute", DotBackward_) .set_attr("FComputeEx", DotBackwardEx) .add_arguments(DotParam::__FIELDS__()); diff --git a/src/operator/tensor/matrix_op.cu b/src/operator/tensor/matrix_op.cu index d10b0ccc46c2..2e1effb9e560 100644 --- a/src/operator/tensor/matrix_op.cu +++ b/src/operator/tensor/matrix_op.cu @@ -44,7 +44,9 @@ NNVM_REGISTER_OP(dot) .set_attr("FComputeEx", DotForwardEx); NNVM_REGISTER_OP(_backward_dot) -.set_attr("FCompute", DotBackward_); +.set_attr("FCompute", DotBackward_) +.set_attr("FComputeEx", DotBackwardEx); + NNVM_REGISTER_OP(batch_dot) .set_attr("FCompute", BatchDotForward_); diff --git a/tests/cpp/ndarray_test.cc b/tests/cpp/ndarray_test.cc index 7954eaa32ffc..e76ea1f42dfe 100644 --- a/tests/cpp/ndarray_test.cc +++ b/tests/cpp/ndarray_test.cc @@ -9,174 +9,43 @@ #include "../src/executor/graph_executor.h" #include "../src/operator/tensor/elemwise_binary_op.h" #include "../src/operator/tensor/elemwise_unary_op.h" +#include "../src/operator/tensor/indexing_op.h" #include "../src/operator/optimizer_op-inl.h" +#include "../src/operator/tensor/init_op.h" +#include "test_utils.h" -#define TEST_DTYPE float -#define TEST_ITYPE int32_t using namespace mxnet; -// TODO(haibin) these functions should be put in test_util.h -void CheckDataRegion(const TBlob &src, const TBlob &dst) { - auto size = src.shape_.Size() * mshadow::mshadow_sizeof(src.type_flag_); - auto equals = memcmp(src.dptr_, dst.dptr_, size); - EXPECT_EQ(equals, 0); -} - -NDArray GetIndexND(const TShape shape, const Context ctx, const std::vector &values) { - NDArray nd(shape, ctx, false, ROW_SPARSE_IDX_TYPE); - size_t num_val = values.size(); - MSHADOW_TYPE_SWITCH(nd.dtype(), DType, { - auto tensor = nd.data().FlatTo1D(); - for (size_t i = 0; i < num_val; i++) { - tensor[i] = values[i]; - } - }); - return nd; -} - -NDArray GetDenseND(const TShape shape, const Context ctx, const std::vector &values) { - NDArray nd(shape, ctx, false); - size_t num_val = values.size(); - CHECK_EQ(num_val, nd.shape().ProdShape(0, nd.shape().ndim())); - MSHADOW_TYPE_SWITCH(nd.dtype(), DType, { - auto tensor = nd.data().FlatTo1D(); - for (size_t i = 0; i < num_val; i++) { - tensor[i] = values[i]; - } - }); - return nd; -} - -NDArray GetRspND(const TShape shape, const Context ctx, const std::vector idx, - const std::vector vals) { - index_t num_rows = 
idx.size(); - index_t num_cols = vals.size() / idx.size(); - NDArray index = GetIndexND(TShape({num_rows}), ctx, idx); - CHECK_EQ(vals.size() % idx.size(), 0); - NDArray raw_data = GetDenseND(TShape({num_rows, num_cols}), ctx, vals); - NDArray nd(raw_data, {index}, ctx, kRowSparseStorage, shape); - return nd; -} - -NDArray Convert(NDArrayStorageType type, NDArray src) { - CHECK_EQ(type, kDefaultStorage); - NDArray converted(src.shape(), src.ctx(), false); - Engine::Get()->PushSync([src, converted](RunContext ctx) { - // TODO provide type in attrs, which is empty now - OpContext op_ctx; - op_ctx.run_ctx = ctx; - if (src.storage_type() == kRowSparseStorage) { - std::vector inputs({src}), outputs({converted}); - op::CastStorageComputeEx({}, op_ctx, inputs, {}, outputs); - } else if (src.storage_type() == kDefaultStorage) { - std::vector inputs({src.data()}), outputs({converted.data()}); - op::IdentityCompute({}, op_ctx, inputs, {kWriteTo}, outputs); - } else { - LOG(FATAL) << "unsupported storage type"; - } - }, src.ctx(), {src.var()}, {converted.var()}, - FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME); - converted.WaitToRead(); - return converted; -} - -// Operators -void BinaryDenseSparseTest() { - Context ctx = Context::CPU(); - - TShape output_shape({3, 2}); - NDArray input_nd0 = GetRspND(output_shape, ctx, {0, 1}, {10, 10, 10, 10}); - NDArray input_nd1 = GetDenseND(output_shape, ctx, {1, 2, 3, 4, 5, 6}); - NDArray output(kRowSparseStorage, output_shape, ctx); - - std::vector const_vars; - const_vars.push_back(input_nd0.var()); - const_vars.push_back(input_nd1.var()); - Engine::Get()->PushSync([input_nd0, input_nd1, output](RunContext ctx) { - nnvm::NodeAttrs attrs; - OpContext op_ctx; - std::vector inputs, outputs; - std::vector req; - inputs.push_back(input_nd0); - inputs.push_back(input_nd1); - outputs.push_back(output); - op::BinaryComputeEx(attrs, op_ctx, inputs, req, outputs); - }, input_nd0.ctx(), const_vars, {output.var()}, - FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME); - std::vector output_vals({11, 12, 3, 4, 15, 16}); - NDArray out_data = GetDenseND(output_shape, ctx, output_vals); - Engine::Get()->WaitForAll(); - CheckDataRegion(out_data.data(), output.data()); - // TODO(haibin) also check with zeros.. 
-} - -void BinaryRsRsTest() { - Context ctx = Context::CPU(); - - TShape index_shape({2}); - NDArray index0 = GetIndexND(index_shape, ctx, {0, 1}); - NDArray index1 = GetIndexND(index_shape, ctx, {0, 2}); - - TShape data_shape({2, 2}); - NDArray raw_data0 = GetDenseND(data_shape, ctx, {10, 10, 10, 10}); - NDArray raw_data1 = GetDenseND(data_shape, ctx, {5, 5, 5, 5}); - - NDArray input_nd0(raw_data0, {index0}, ctx, kRowSparseStorage, data_shape); - NDArray input_nd1(raw_data1, {index1}, ctx, kRowSparseStorage, data_shape); - - TShape output_shape({4, 2}); - NDArray output(kRowSparseStorage, output_shape, ctx); - std::vector const_vars; - const_vars.push_back(input_nd0.var()); - const_vars.push_back(input_nd1.var()); - - Engine::Get()->PushSync([input_nd0, input_nd1, output](RunContext ctx) { - OpContext op_ctx; - std::vector inputs, outputs; - std::vector req; - inputs.push_back(input_nd0); - inputs.push_back(input_nd1); - outputs.push_back(output); - op::BinaryComputeRspRsp({}, op_ctx, inputs, req, outputs); - }, input_nd0.ctx(), const_vars, {output.var()}, - FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME); - - - // Check the data region of output ndarray - NDArray dense_output = GetDenseND(output_shape, ctx, {15, 15, 10, 10, 5, 5, 0, 0}); - NDArray copy = Convert(kDefaultStorage, output); - CheckDataRegion(input_nd0.data(), raw_data0.data()); - CheckDataRegion(input_nd1.data(), raw_data1.data()); - CheckDataRegion(dense_output.data(), copy.data()); -} - -// Conversion -void DenseToDenseConversionTest() { +// Conversion Tests +void CastDnsDnsTest() { Context ctx; TShape shape({2, 2}); - NDArray nd = GetDenseND(shape, ctx, {1, 2, 3, 10}); + NDArray nd = DnsND(shape, ctx, {}); auto nd_copy = Convert(kDefaultStorage, nd); CheckDataRegion(nd_copy.data(), nd.data()); } -void SparseToDenseConversionTest() { +void CastRspDnsTest() { Context ctx; // Sparse ndarray TShape shape({2, 2}); - NDArray nd = GetRspND(shape, ctx, {0}, {1, 1}); + float v1 = RandFloat(); + float v2 = RandFloat(); + NDArray nd = RspND(shape, ctx, {0}, {v1, v2}); // Dense ndarray - NDArray dense_nd = GetDenseND(shape, ctx, {1, 1, 0, 0}); + NDArray dense_nd = DnsND(shape, ctx, {v1, v2, 0, 0}); NDArray converted = Convert(kDefaultStorage, nd); CheckDataRegion(converted.data(), dense_nd.data()); } -// NDArray Function +// NDArray function tests void SetValueTest() { Context ctx = Context::CPU(); TShape data_shape({2, 2}); - NDArray nd0 = GetDenseND(data_shape, ctx, {10, 10, 10, 10}); + float v = RandFloat(); + NDArray nd0 = DnsND(data_shape, ctx, {v, v, v, v}); NDArray nd1(data_shape, ctx, false); - nd1 = 10; + nd1 = v; nd1.WaitToRead(); CheckDataRegion(nd0.data(), nd1.data()); } @@ -184,28 +53,35 @@ void SetValueTest() { // InferStorage void InferElemwiseStorageTest() { nnvm::NodeAttrs attrs; - attrs.name = "Test op"; + attrs.name = "test_op"; std::vector in_attrs({kRowSparseStorage, kDefaultStorage}); std::vector out_attrs({kUndefinedStorage}); - + // rsp, default -> default op::ElemwiseStorageType<2, 1>(attrs, &in_attrs, &out_attrs); EXPECT_EQ(out_attrs[0], kDefaultStorage); + // default, rsp -> default in_attrs = {kDefaultStorage, kRowSparseStorage}; out_attrs = {kUndefinedStorage}; op::ElemwiseStorageType<2, 1>(attrs, &in_attrs, &out_attrs); EXPECT_EQ(out_attrs[0], kDefaultStorage); + // rsp, rsp -> rsp + in_attrs = {kRowSparseStorage}; + out_attrs = {kUndefinedStorage, kUndefinedStorage}; + op::ElemwiseStorageType<1, 2>(attrs, &in_attrs, &out_attrs); + EXPECT_EQ(out_attrs[0], kRowSparseStorage); + EXPECT_EQ(out_attrs[1], 
kRowSparseStorage); } // Optimizer void SGDDnsRspTest() { TShape shape({4, 2}); Context ctx = Context::CPU(); - NDArray weight = GetDenseND(shape, ctx, {1, 2, 3, 4, 5, 6, 7, 8}); - NDArray rsp_grad = GetRspND(shape, ctx, {0, 3}, {1, 2, 3, 4}); + NDArray weight = DnsND(shape, ctx, {1, 2, 3, 4, 5, 6, 7, 8}); + NDArray rsp_grad = RspND(shape, ctx, {0, 3}, {1, 2, 3, 4}); NDArray output = weight; - float lr = 0.1; - float wd = 0.95; - float rescale = 2; + float lr = RandFloat(); + float wd = RandFloat(); + float rescale = RandFloat(); op::SGDParam param; param.lr = lr; param.wd = wd; @@ -214,37 +90,157 @@ void SGDDnsRspTest() { Engine::Get()->PushSync([weight, rsp_grad, output, param](RunContext ctx) { std::vector inputs{weight, rsp_grad}, outputs{output}; std::vector req({kAddTo}); - op::SGDUpdateDnsRspImpl(param, {}, inputs, req, outputs); + op::SparseSGDUpdateDnsRspImpl(param, {}, inputs, req, outputs); }, weight.ctx(), {rsp_grad.var()}, {output.var()}, FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME); auto sgd = [lr, wd, rescale] (TEST_DTYPE weight, TEST_DTYPE grad) { return (1.f-lr*wd)*weight - (lr*rescale)*grad; }; - NDArray expected = GetDenseND(shape, ctx, - {1 + sgd(1, 1), 2 + sgd(2, 2), 3, 4, 5, 6, - 7 + sgd(7, 3), 8 + sgd(8, 4)}); + NDArray expected = DnsND(shape, ctx, + {1 + sgd(1, 1), 2 + sgd(2, 2), 3, 4, 5, 6, + 7 + sgd(7, 3), 8 + sgd(8, 4)}); output.WaitToRead(); CheckDataRegion(output.data(), expected.data()); } +void CopyFromToRspDnsTest() { + Context ctx; + // Sparse ndarray + TShape shape({2, 2}); + NDArray nd = RspND(shape, ctx, {0}, {1, 1}); + // Dense ndarray + NDArray dns_nd = DnsND(shape, ctx, {}); + CopyFromTo(nd, &dns_nd); + dns_nd.WaitToRead(); + CheckDataRegion(nd.data(), dns_nd.data()); +} + +void CopyFromToRspRspReuseTest() { + Context ctx; + // Sparse ndarray + TShape shape({3, 2}); + NDArray nd = RspND(shape, ctx, {0}, {1,2}); + // Sparse ndarray with enough memory. It's expected to reuse the memory + NDArray dst_nd = RspND(shape, ctx, {0, 1, 2}, {6,6,6,6,6,6}); + nd.WaitToRead(); + CopyFromTo(nd, &dst_nd); + dst_nd.WaitToRead(); + CheckDataRegion(nd.data(), dst_nd.data()); + CHECK_EQ(dst_nd.aux_shape(rowsparse::kIdx)[0], 1); + CHECK_EQ(dst_nd.storage_shape()[0], 1); + CHECK_EQ(dst_nd.storage_shape()[1], 2); +} + + +void CopyFromToRspRspFreeTest() { + Context ctx; + // Sparse ndarray + TShape shape({3, 2}); + NDArray nd = RspND(shape, ctx, {0, 1}, {1,1,1,1}); + // Sparse ndarray with enough memory. 
It's expected to reuse the memory + NDArray dst_nd = RspND(shape, ctx, {0}, {2,2}); + nd.WaitToRead(); + CopyFromTo(nd, &dst_nd); + dst_nd.WaitToRead(); + CheckDataRegion(nd.data(), dst_nd.data()); +} + +void BinaryAddRspRsp() { + Context ctx = Context::CPU(); + + TShape output_shape({4, 2}); + NDArray input_nd0 = RspND(output_shape, ctx, {0, 1}, {10,10,10,10}); + NDArray input_nd1 = RspND(output_shape, ctx, {0, 2}, {5,5,5,5}); + + NDArray output(kRowSparseStorage, output_shape, ctx); + std::vector const_vars; + const_vars.push_back(input_nd0.var()); + const_vars.push_back(input_nd1.var()); + + Engine::Get()->PushSync([input_nd0, input_nd1, output](RunContext ctx) { + OpContext op_ctx; + std::vector inputs, outputs; + std::vector req; + inputs.push_back(input_nd0); + inputs.push_back(input_nd1); + outputs.push_back(output); + op::BinaryComputeRspRsp({}, op_ctx, inputs, req, outputs); + }, input_nd0.ctx(), const_vars, {output.var()}, + FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME); + + // Check the data region of output ndarray + NDArray dense_output = DnsND(output_shape, ctx, {15, 15, 10, 10, 5, 5, 0, 0}); + NDArray copy = Convert(kDefaultStorage, output); + CheckDataRegion(dense_output.data(), copy.data()); +} + +void SparseEmbeddingBackwardTest() { + Context ctx = Context::CPU(); + // d1 .. dk + // idx shape : (2, 3) + // input dim 4, output dim 2 + int input_dim = 4; + int output_dim = 2; + TShape idx_shape({2, 3}); + NDArray idx = RspIdxND(idx_shape, ctx, {1, 2, 3, 1, 2, 3}); + TShape grad_shape({2, 3, 2}); + NDArray grad = DnsND(grad_shape, ctx, {0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2}); + TShape out_shape({4, 2}); + NDArray output = NDArray(kRowSparseStorage, out_shape, ctx); + op::EmbeddingParam param; + param.input_dim = input_dim; + param.output_dim = output_dim; + param.dtype = 0; + + Engine::Get()->PushSync([idx, grad, output, param](RunContext ctx) { + std::vector inputs{grad, idx}, outputs{output, output}; + // this is a hack + std::vector req({kNullOp, kAddTo}); + op::SparseEmbeddingOpBackwardEx({}, {}, inputs, req, outputs); + }, output.ctx(), {grad.var(), idx.var()}, {output.var()}, + FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME); + + NDArray expected = DnsND(out_shape, ctx, {0,0,0,0,0,0,0,0}); + Engine::Get()->PushSync([idx, grad, expected, param](RunContext ctx) { + std::vector inputs{grad.data(), idx.data()}, outputs{expected.data(), expected.data()}; + std::vector req({kNullOp, kWriteTo}); + op::EmbeddingOpBackward({}, {}, inputs, req, outputs); + }, expected.ctx(), {grad.var(), idx.var()}, {expected.var()}, + FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME); + NDArray converted = Convert(kDefaultStorage, output); + expected.WaitToRead(); + CheckDataRegion(converted.data(), expected.data()); +} + + +TEST(NDArray, binary_add) { + BinaryAddRspRsp(); +} + TEST(NDArray, conversion) { - DenseToDenseConversionTest(); - SparseToDenseConversionTest(); + CastDnsDnsTest(); + CastRspDnsTest(); } TEST(NDArray, functions) { SetValueTest(); } -TEST(NDArray, basics) { - BinaryRsRsTest(); - //Wait for all operations to finish - Engine::Get()->WaitForAll(); - InferElemwiseStorageTest(); -} - TEST(NDArray, optimizer) { SGDDnsRspTest(); } +TEST(NDArray, copy) { + CopyFromToRspDnsTest(); + CopyFromToRspRspReuseTest(); + CopyFromToRspRspFreeTest(); +} + +TEST(NDArray, infer_storage) { + InferElemwiseStorageTest(); +} + +TEST(NDArray, sparse_embedding) { + SparseEmbeddingBackwardTest(); +} diff --git a/tests/cpp/test_utils.h b/tests/cpp/test_utils.h new file 
mode 100644 index 000000000000..45a7ba072934 --- /dev/null +++ b/tests/cpp/test_utils.h @@ -0,0 +1,105 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../src/operator/tensor/elemwise_binary_op.h" +#include "../src/operator/tensor/elemwise_unary_op.h" +#include "../src/operator/optimizer_op-inl.h" +#include "../src/operator/tensor/init_op.h" + +using namespace mxnet; +#define TEST_DTYPE float +#define TEST_ITYPE int32_t + +void CheckDataRegion(const TBlob &src, const TBlob &dst) { + auto size = src.shape_.Size() * mshadow::mshadow_sizeof(src.type_flag_); + auto equals = memcmp(src.dptr_, dst.dptr_, size); + EXPECT_EQ(equals, 0); +} + +float RandFloat() { + float v = rand() * 1.0 / RAND_MAX; + return v; +} + +// Get an NDArray with provided indices, prepared for a RowSparse NDArray. +NDArray RspIdxND(const TShape shape, const Context ctx, const std::vector &values) { + NDArray nd(shape, ctx, false, ROW_SPARSE_IDX_TYPE); + size_t num_val = values.size(); + MSHADOW_TYPE_SWITCH(nd.dtype(), DType, { + auto tensor = nd.data().FlatTo1D(); + for (size_t i = 0; i < num_val; i++) { + tensor[i] = values[i]; + } + }); + return nd; +} + +// Get a dense NDArray with provided values. +NDArray DnsND(const TShape shape, const Context ctx, std::vector vs) { + NDArray nd(shape, ctx, false); + size_t num_val = shape.Size(); + // generate random values + while (vs.size() < num_val) { + auto v = RandFloat(); + vs.push_back(v); + } + CHECK_EQ(vs.size(), nd.shape().Size()); + MSHADOW_TYPE_SWITCH(nd.dtype(), DType, { + auto tensor = nd.data().FlatTo1D(); + for (size_t i = 0; i < num_val; i++) { + tensor[i] = vs[i]; + } + }); + return nd; +} + +// Get a RowSparse NDArray with provided indices and values +NDArray RspND(const TShape shape, const Context ctx, const std::vector idx, + std::vector vals) { + CHECK(shape.ndim() <= 2) << "High dimensional row sparse not implemented yet"; + index_t num_rows = idx.size(); + index_t num_cols = vals.size() / idx.size(); + // create index NDArray + NDArray index = RspIdxND(TShape({num_rows}), ctx, idx); + CHECK_EQ(vals.size() % idx.size(), 0); + // create value NDArray + NDArray data = DnsND(TShape({num_rows, num_cols}), ctx, vals); + // create result nd + NDArray nd(kRowSparseStorage, shape, ctx, false, mshadow::default_type_flag, + {}, {TShape({num_rows})}); + // assign values + NDArray nd_aux = nd.AuxNDArray(0); + NDArray nd_data = nd.DataNDArray(); + CopyFromTo(index, &nd_aux); + CopyFromTo(data, &nd_data); + return nd; +} + +// TODO(haibin) support other types +NDArray Convert(NDArrayStorageType type, NDArray src) { + CHECK_EQ(type, kDefaultStorage); + NDArray converted(src.shape(), src.ctx(), false); + Engine::Get()->PushSync([src, converted](RunContext ctx) { + // TODO provide type in attrs, which is empty now + OpContext op_ctx; + op_ctx.run_ctx = ctx; + if (src.storage_type() == kRowSparseStorage) { + std::vector inputs({src}), outputs({converted}); + op::CastStorageComputeEx({}, op_ctx, inputs, {}, outputs); + } else if (src.storage_type() == kDefaultStorage) { + std::vector inputs({src.data()}), outputs({converted.data()}); + op::IdentityCompute({}, op_ctx, inputs, {kWriteTo}, outputs); + } else { + LOG(FATAL) << "unsupported storage type"; + } + }, src.ctx(), {src.var()}, {converted.var()}, + FnProperty::kNormal, 0, PROFILER_MESSAGE_FUNCNAME); + converted.WaitToRead(); + return converted; +} diff --git a/tests/python/unittest/test_multi_device_exec.py b/tests/python/unittest/test_multi_device_exec.py index 
f80f40ba7c32..d5da9eefcae9 100644 --- a/tests/python/unittest/test_multi_device_exec.py +++ b/tests/python/unittest/test_multi_device_exec.py @@ -38,7 +38,7 @@ def check_ctx_group_sparse(mode='dense_sparse'): dense_np = np.array([[1,2],[3,4],[5,6]]) sparse_np1 = np.array([[5,10],[0,0],[0,0]]) dense_nd = mx.nd.array(dense_np) - val = mx.nd.array([5, 10]); + val = mx.nd.array([[5, 10]]); idx = mx.nd.array([0], dtype=np.int32); sparse_nd1 = mx.sparse_nd.row_sparse(val, idx, (3,2)) sparse_nd2 = mx.sparse_nd.row_sparse(val, idx, (3,2)) diff --git a/tests/python/unittest/test_optimizer.py b/tests/python/unittest/test_optimizer.py index ae2a29fddc38..ac2a697d40d2 100755 --- a/tests/python/unittest/test_optimizer.py +++ b/tests/python/unittest/test_optimizer.py @@ -35,15 +35,15 @@ def compare_optimizer(opt1, opt2, shape, w_stype='default', g_stype='default'): w2 = mx.random.uniform(shape=shape, ctx=default_context()) w1 = w2.copyto(default_context()) elif w_stype == 'row_sparse': - w2 = random_sparse_ndarray(shape, w_stype, allow_zeros = False) - w1 = random_sparse_ndarray(shape, w_stype, allow_zeros = False).to_dense() + w2 = rand_ndarray(shape, w_stype) + w1 = rand_ndarray(shape, w_stype).to_dense() else: raise Exception("type not supported yet") if g_stype == 'default': g2 = mx.random.uniform(shape=shape, ctx=default_context()) g1 = g2.copyto(default_context()) elif g_stype == 'row_sparse': - g2 = random_sparse_ndarray(shape, g_stype, allow_zeros = False) + g2 = rand_ndarray(shape, g_stype) g1 = g2.copyto(default_context()).to_dense() else: raise Exception("type not supported yet") diff --git a/tests/python/unittest/test_sparse_ndarray.py b/tests/python/unittest/test_sparse_ndarray.py index c089e002c242..e47c2b2a75a1 100644 --- a/tests/python/unittest/test_sparse_ndarray.py +++ b/tests/python/unittest/test_sparse_ndarray.py @@ -1,17 +1,17 @@ import os -import random import mxnet as mx import numpy as np import pickle as pkl from mxnet.test_utils import * from numpy.testing import assert_allclose +import numpy.random as rnd def check_sparse_nd_elemwise_binary(shapes, storage_types, f, g): # generate inputs nds = [] for i, storage_type in enumerate(storage_types): if storage_type == 'row_sparse': - nd = random_sparse_ndarray(shapes[i], storage_type, allow_zeros = False) + nd, _ = rand_sparse_ndarray(shapes[i], storage_type) elif storage_type == 'default': nd = mx.nd.array(random_arrays(shapes[i]), dtype = np.float32) else: @@ -26,7 +26,7 @@ def test_sparse_nd_elemwise_add(): g = lambda x,y: x + y op = mx.nd.elemwise_add for i in xrange(num_repeats): - shape = [(random.randint(1, 10),random.randint(1, 10))] * 2 + shape = [(rnd.randint(1, 10),rnd.randint(1, 10))] * 2 check_sparse_nd_elemwise_binary(shape, ['default'] * 2, op, g) check_sparse_nd_elemwise_binary(shape, ['default', 'row_sparse'], op, g) check_sparse_nd_elemwise_binary(shape, ['row_sparse', 'row_sparse'], op, g) @@ -37,48 +37,112 @@ def test_sparse_nd_elementwise_fallback(): g = lambda x,y: x + y op = mx.nd.add_n for i in xrange(num_repeats): - shape = [(random.randint(1, 10),random.randint(1, 10))] * 2 + shape = [(rnd.randint(1, 10), rnd.randint(1, 10))] * 2 check_sparse_nd_elemwise_binary(shape, ['default'] * 2, op, g) check_sparse_nd_elemwise_binary(shape, ['default', 'row_sparse'], op, g) check_sparse_nd_elemwise_binary(shape, ['row_sparse', 'row_sparse'], op, g) -def check_conversion_row_sparse(): - val = np.array([5, 10]) - idx = np.array([1]) - sparse_val = np.array([[0, 0], [5, 10], [0, 0], [0, 0], [0, 0]]) - a = 
mx.nd.array(val) - b = mx.nd.array(idx, dtype=np.int32) - d = mx.sparse_nd.array(a, [b], 'row_sparse', (5,2)) - f = mx.sparse_nd.to_dense(d) - assert_almost_equal(f.asnumpy(), sparse_val) - -def test_sparse_nd_conversion(): - check_conversion_row_sparse() - def test_sparse_nd_zeros(): - zero = mx.nd.zeros((2,2)) - sparse_zero = mx.sparse_nd.zeros((2,2), 'row_sparse') - assert_almost_equal(sparse_zero.asnumpy(), zero.asnumpy()) + def check_sparse_nd_zeros(shape, stype): + zero = mx.nd.zeros(shape) + sparse_zero = mx.sparse_nd.zeros('row_sparse', shape) + assert_almost_equal(sparse_zero.asnumpy(), zero.asnumpy()) + + shape = (rnd.randint(1, 10), rnd.randint(1, 10)) + check_sparse_nd_zeros(shape, 'row_sparse') + check_sparse_nd_zeros(shape, 'csr') -def check_sparse_nd_copy(storage_type): - c = random_sparse_ndarray((10, 10), storage_type, allow_zeros = True) - d = c.copyto(mx.Context('cpu', 0)) - assert np.sum(np.abs(c.asnumpy() != d.asnumpy())) == 0.0 def test_sparse_nd_copy(): - check_sparse_nd_copy('row_sparse') + def check_sparse_nd_copy(from_stype, to_stype): + shape = (rnd.randint(1, 10), rnd.randint(1, 10)) + from_nd = rand_ndarray(shape, from_stype) + # copy to ctx + to_ctx = from_nd.copyto(default_context()) + # copy to stype + to_nd = rand_ndarray(shape, to_stype) + to_nd = from_nd.copyto(to_nd) + assert np.sum(np.abs(from_nd.asnumpy() != to_ctx.asnumpy())) == 0.0 + assert np.sum(np.abs(from_nd.asnumpy() != to_nd.asnumpy())) == 0.0 -def test_sparse_nd_property(): + check_sparse_nd_copy('row_sparse', 'row_sparse') + check_sparse_nd_copy('row_sparse', 'default') + check_sparse_nd_copy('default', 'row_sparse') + +def check_sparse_nd_prop_rsp(): storage_type = 'row_sparse' - a = random_sparse_ndarray((10, 10), storage_type, allow_zeros = True) - assert(a.num_aux == 1) - assert(a.aux_type(0) == np.int32) - assert(a.storage_type == 'row_sparse') + shape = (rnd.randint(1, 2), rnd.randint(1, 2)) + nd, (v, idx) = rand_sparse_ndarray(shape, storage_type) + assert(nd._num_aux == 1) + assert(nd.indices.dtype == np.int32) + assert(nd.storage_type == 'row_sparse') + assert_almost_equal(nd._data().asnumpy(), v) + assert_almost_equal(nd._aux_data(0).asnumpy(), idx) + +def test_sparse_nd_basic(): + def check_rsp_creation(values, indices, shape): + rsp = mx.sparse_nd.row_sparse(values, indices, shape) + dns = mx.nd.zeros(shape) + dns[1] = mx.nd.array(values[0]) + dns[3] = mx.nd.array(values[1]) + assert_almost_equal(rsp.asnumpy(), dns.asnumpy()) + indices = mx.nd.array(indices).asnumpy() + assert_almost_equal(rsp.indices.asnumpy(), indices) + + def check_csr_creation(shape): + csr, (indptr, indices, values) = rand_sparse_ndarray(shape, 'csr') + assert_almost_equal(csr.indptr.asnumpy(), indptr) + assert_almost_equal(csr.indices.asnumpy(), indices) + assert_almost_equal(csr.values.asnumpy(), values) + + shape = (4,2) + values = np.random.rand(2,2) + indices = np.array([1,3]) + check_rsp_creation(values, indices, shape) + + values = mx.nd.array(np.random.rand(2,2)) + indices = mx.nd.array([1,3], dtype='int32') + check_rsp_creation(values, indices, shape) + + values = [[0.1, 0.2], [0.3, 0.4]] + indices = [1,3] + check_rsp_creation(values, indices, shape) + + check_csr_creation(shape) + check_sparse_nd_prop_rsp() + + +def test_sparse_nd_setitem(): + shape = (3, 4) + # ndarray assignment + x = mx.sparse_nd.zeros('row_sparse', shape) + x[:] = mx.nd.ones(shape) + x_np = np.ones(shape, dtype=x.dtype) + assert same(x.asnumpy(), x_np) + + # numpy assignment + x = mx.sparse_nd.zeros('row_sparse', shape) + x[:] = 
np.ones(shape) + x_np = np.ones(shape, dtype=x.dtype) + assert same(x.asnumpy(), x_np) + +def test_sparse_nd_slice(): + def check_sparse_nd_csr_slice(shape): + storage_type = 'csr' + A, _ = rand_sparse_ndarray(shape, storage_type) + A2 = A.asnumpy() + start = rnd.randint(0, shape[0] - 1) + end = rnd.randint(start + 1, shape[0]) + assert same(A[start:end].asnumpy(), A2[start:end]) + + shape = (rnd.randint(2, 10), rnd.randint(1, 10)) + check_sparse_nd_csr_slice(shape) if __name__ == '__main__': - test_sparse_nd_conversion() test_sparse_nd_zeros() test_sparse_nd_elementwise_fallback() - test_sparse_nd_elemwise_add() test_sparse_nd_copy() - test_sparse_nd_property() + test_sparse_nd_elemwise_add() + test_sparse_nd_setitem() + test_sparse_nd_basic() + test_sparse_nd_slice() diff --git a/tests/python/unittest/test_sparse_operator.py b/tests/python/unittest/test_sparse_operator.py index 6e7c97ca7f43..9ae86b25f94a 100644 --- a/tests/python/unittest/test_sparse_operator.py +++ b/tests/python/unittest/test_sparse_operator.py @@ -1,86 +1,48 @@ # pylint: skip-file import numpy as np import mxnet as mx +import scipy as sp from numpy.testing import assert_allclose from mxnet.test_utils import * - -def test_elemwise_add_dense(): - data1 = mx.symbol.Variable('data1') - data2 = mx.symbol.Variable('data2') - shape = (1, 1) - data1_tmp = np.ones(shape) - data2_tmp = np.zeros(shape) + 2 - test = mx.symbol.elemwise_add(data1, data2) - # check_numeric_gradient(test, [data_tmp]) - check_symbolic_forward(test, {'data1':data1_tmp, - 'data2':data2_tmp}, [data1_tmp + data2_tmp]) - #check_symbolic_backward(test, [data_tmp], [np.ones(shape)], [2 * data_tmp]) - arr_grad1 = mx.nd.empty(shape) - arr_grad2 = mx.nd.empty(shape) - # init grad arrays before bind - exec_test = test.bind(default_context(), args={'data1':mx.nd.array(data1_tmp), 'data2':mx.nd.array(data2_tmp)}, - args_grad=[arr_grad1, arr_grad2]) - exec_test.forward(is_train=True) - assert_almost_equal(exec_test.outputs[0].asnumpy(), data1_tmp + data2_tmp) - exec_test.backward(out_grads = exec_test.outputs) - assert_almost_equal(arr_grad1.asnumpy(), arr_grad2.asnumpy()) - - -def test_elemwise_add_dense_sparse(): - # prep data - dense_np = np.array([[1,2],[3,4],[5,6]]) - sparse_np1 = np.array([[5,10],[0,0],[0,0]]) - dense_nd = mx.nd.array(dense_np) - - val = mx.nd.array([5, 10]); - idx = mx.nd.array([0], dtype=np.int32); - sparse_nd1 = mx.sparse_nd.row_sparse(val, idx, (3,2)) - - data1 = mx.symbol.Variable('data1') - data2 = mx.symbol.Variable('data2', storage_type='row_sparse') - test = mx.symbol.elemwise_add(data1, data2, name='plus') - check_symbolic_forward(test, {'data1':dense_nd, - 'data2':sparse_nd1}, [dense_np + sparse_np1]) - - -def test_elemwise_add_sparse_sparse(): - # prep data - shape = (4, 2) - sparse_np1 = np.array([[5,10],[0,0],[0,0],[0,0]]) - sparse_np2 = np.array([[0,0],[5,10],[0,0],[0,0]]) - - val1 = mx.nd.array([5, 10]) - val2 = mx.nd.array([5, 10]) - idx1 = mx.nd.array([0], dtype=np.int32); - idx2 = mx.nd.array([1], dtype=np.int32); - sparse_nd1 = mx.sparse_nd.row_sparse(val1, idx1, shape) - sparse_nd2 = mx.sparse_nd.row_sparse(val2, idx2, shape) - - data1 = mx.symbol.Variable('data1', storage_type='row_sparse') - data2 = mx.symbol.Variable('data2', storage_type='row_sparse') - test = mx.symbol.elemwise_add(data1, data2, name='plus') - check_symbolic_forward(test, {'data1':sparse_nd1, - 'data2':sparse_nd2}, [sparse_np1 + sparse_np2]) - arr_grad1 = mx.sparse_nd.zeros(shape, 'row_sparse') - arr_grad2 = mx.sparse_nd.zeros(shape, 'row_sparse') - 
exec_test = test.bind(default_context(), args={'data1':sparse_nd1, 'data2':sparse_nd2}, - args_grad=[arr_grad1, arr_grad2]) - exec_test.forward(is_train=True) - assert_almost_equal(exec_test.outputs[0].asnumpy(), sparse_np1 + sparse_np2) - exec_test.backward(out_grads = exec_test.outputs) - assert_almost_equal(arr_grad1.asnumpy(), arr_grad2.asnumpy()) - - -def test_elemwise_add_multiple_stages(): +def check_elemwise_add_ex(lhs_stype, rhs_stype, shape, lhs_grad_stype=None, rhs_grad_stype=None): + lhs = mx.symbol.Variable('lhs', storage_type = lhs_stype) + rhs = mx.symbol.Variable('rhs', storage_type = rhs_stype) + if lhs_grad_stype is not None: + lhs._set_attr(grad_stype_hint=str(lhs_grad_stype)) + if rhs_grad_stype is not None: + rhs._set_attr(grad_stype_hint=str(rhs_grad_stype)) + + lhs_nd = rand_ndarray(shape, lhs_stype) + rhs_nd = rand_ndarray(shape, rhs_stype) + lhs_np = lhs_nd.asnumpy() + rhs_np = rhs_nd.asnumpy() + + out_np = lhs_np + rhs_np + test = mx.symbol.elemwise_add(lhs, rhs) + location = {'lhs':lhs_nd, 'rhs':rhs_nd} + check_symbolic_forward(test, location, [out_np]) + check_numeric_gradient(test, location) + check_symbolic_backward(test, location, [out_np], [out_np, out_np]) + +def test_elemwise_add_ex(): + shape = (rnd.randint(1, 10),rnd.randint(1, 10)) + check_elemwise_add_ex('default', 'default', shape) + check_elemwise_add_ex('default', 'row_sparse', shape) + check_elemwise_add_ex('row_sparse', 'default', shape) + check_elemwise_add_ex('row_sparse', 'row_sparse', shape, + lhs_grad_stype='row_sparse', rhs_grad_stype='row_sparse') + +# TODO(haibin) randomize this test +def test_elemwise_add_ex_multiple_stages(): # prep data shape = (4, 2) ds_np = np.array([[1,2],[3,4],[5,6],[7,8]]) sp_np1 = np.array([[5,10],[0,0],[0,0],[0,0]]) sp_np2 = np.array([[0,0],[5,10],[0,0],[0,0]]) - val1 = mx.nd.array([5, 10]); - val2 = mx.nd.array([5, 10]); + val1 = mx.nd.array([[5, 10]]); + val2 = mx.nd.array([[5, 10]]); idx1 = mx.nd.array([0], dtype=np.int32); idx2 = mx.nd.array([1], dtype=np.int32); sp_nd1 = mx.sparse_nd.row_sparse(val1, idx1, shape) @@ -105,37 +67,28 @@ def test_elemwise_add_multiple_stages(): exec_test.backward(out_grads = exec_test.outputs) assert_almost_equal(arr_grads[0].asnumpy(), arr_grads[1].asnumpy()) - -def test_cast_storage(): - def test_rsp_to_dns(data, row_idx, shape): - rsp = mx.sparse_nd.array(values=data, index_list=[row_idx], storage_type='row_sparse', shape=shape) +# TODO(haibin) also add test for backward pass +def test_cast_storage_ex(): + def test_rsp_to_dns(shape): + rsp, (data, row_idx) = rand_sparse_ndarray(shape, 'row_sparse') dns_out = mx.nd.cast_storage(rsp, storage_type='default') dns_expected = np.zeros(shape, dtype=default_dtype()) - for k, v in enumerate(row_idx): - dns_expected[v, :] = data[k] + if row_idx is not None: + for k, v in enumerate(row_idx): + dns_expected[v, :] = data[k] assert same(dns_out.asnumpy(), dns_expected) - def test_dns_to_rsp(dns_in): - dns_in = np.array(dns_in) + def test_dns_to_rsp(shape): + dns_in = rand_ndarray(shape, 'default') rsp_out = mx.nd.cast_storage(mx.nd.array(dns_in, dtype=default_dtype()), storage_type='row_sparse') ret = mx.nd.cast_storage(rsp_out, storage_type='default') - assert same(ret.asnumpy(), dns_in) + assert same(ret.asnumpy(), dns_in.asnumpy()) - def test_csr_to_dns(data, indptr, col_idx, shape): - indptr = np.array(indptr, dtype=np.int32) - col_idx = np.array(col_idx, dtype=np.int32) - csr = mx.sparse_nd.array(values=data, index_list=[col_idx, indptr], storage_type='csr', shape=shape, - 
aux_types=[np.int32, np.int32]) - dns_out = mx.nd.cast_storage(csr, storage_type='default') - dns_expected = np.zeros(shape, dtype=default_dtype()) - i = 0 - while i < len(indptr) - 1: - j = indptr[i] - while j < indptr[i+1]: - dns_expected[i, col_idx[j]] = data[j] - j = j + 1 - i = i + 1 - assert same(dns_out.asnumpy(), dns_expected) + def test_csr_to_dns(shape): + csr, (indptr, indices, values) = rand_sparse_ndarray(shape, 'csr') + mx_dns = csr.to_dense() + np_dns = sp.sparse.csr_matrix((values, indices, indptr), shape).todense() + assert_almost_equal(mx_dns.asnumpy(), np_dns) def test_dns_to_csr(dns_in): dns_in= np.array(dns_in) @@ -143,11 +96,10 @@ def test_dns_to_csr(dns_in): ret = mx.nd.cast_storage(csr_out, storage_type='default') assert same(ret.asnumpy(), dns_in) - test_rsp_to_dns([], [], (10, 3)) - test_rsp_to_dns([[1, 2], [3, 4], [5, 6], [7, 8]], [2, 4, 5, 7], (10, 2)) - test_dns_to_rsp([[0, 1, 0], [0, 2, 0], [3, 0, 0], [0, 0, 4], [5, 6, 0], [0, 0, 7]]) - test_csr_to_dns([], [0, 0, 0, 0, 0], [], (4, 4)) - test_csr_to_dns([5, 8, 3, 6], [0, 0, 2, 3, 4], [0, 1, 2, 1], (4, 4)) + shape = (rnd.randint(1, 10),rnd.randint(1, 10)) + test_rsp_to_dns(shape) + test_dns_to_rsp(shape) + test_csr_to_dns((4, 4)) test_dns_to_csr([[0, 1, 0], [0, 2, 0], [3, 0, 0], [0, 0, 4], [5, 6, 0], [0, 0, 7]]) # TODO(junwu): The backward of the operator dot cannot be tested for now @@ -157,38 +109,75 @@ def test_dns_to_csr(dns_in): # the same impl function of dot(csr, dns) = rsp and it has been tested # in the forward test cases as the following. def test_sparse_dot(): - def test_dot_csr_dns_rsp(dns1, dns2, trans_csr): - dns1 = mx.nd.array(dns1) - dns2 = mx.nd.array(dns2) + def test_dot_csr_dns_rsp(csr_shape, dns_shape, dns_grad_stype, trans_csr): + dns1 = rand_ndarray(csr_shape, 'default') + dns2 = rand_ndarray(dns_shape, 'default') csr = mx.nd.cast_storage(dns1, storage_type='csr') rsp_out = mx.nd.dot(csr, dns2, transpose_a=trans_csr) - rsp_expected = mx.nd.dot(csr.to_dense(), dns2, transpose_a=trans_csr) + rsp_expected = mx.nd.dot(dns1, dns2, transpose_a=trans_csr) + out_np = rsp_expected.asnumpy() + backward_trans = not trans_csr + rhs_backward_grad = mx.nd.dot(dns1, rsp_expected, transpose_a=backward_trans).asnumpy() # TODO(junwu): may need to compare rsp_out and rsp_expected in rsp format # instead of converting them to the dense format - assert same(rsp_out.asnumpy(), rsp_expected.asnumpy()) + assert same(rsp_out.asnumpy(), out_np) # test symbolic forward lhs = mx.symbol.Variable('lhs', storage_type='csr') rhs = mx.symbol.Variable('rhs', storage_type='default') - sym_dot = mx.symbol.dot(lhs, rhs, transpose_a=trans_csr) - dns2_grad = mx.sparse_nd.zeros(dns2.shape, 'row_sparse') - exec_dot = sym_dot.bind(default_context(), args={'lhs': csr, 'rhs': dns2}, args_grad={'rhs': dns2_grad}, - grad_req={'lhs': 'null', 'rhs': 'write'}) - exec_dot.forward(is_train=True) - assert same(exec_dot.outputs[0].asnumpy(), rsp_expected.asnumpy()) - - test_dot_csr_dns_rsp(dns1=[[0, 0, 1, 4], [2, 0, 0, 0], [0, 0, 0, 0], [2, 9, 0, 5], [0, 0, 0, 1]], - dns2=[[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16]], - trans_csr=False) - test_dot_csr_dns_rsp(dns1=[[0, 0, 1, 4], [2, 0, 0, 0], [0, 0, 0, 0], [2, 9, 0, 5], [0, 0, 0, 1]], - dns2=[[1, 2, 3, 4, 5], [5, 6, 7, 8, 6], [9, 10, 11, 12, 6], [13, 14, 15, 16, 7], - [1, 1, 1, 1, 2]], trans_csr=True) + rhs._set_attr(grad_stype_hint=str(dns_grad_stype)) + # TODO(haibin) since backward op is not fully implemented, here we add a dense zero ndarray + # so that the output 
gradient is dense. + zeros = mx.symbol.Variable('zero', storage_type='default') + sym_dot = mx.symbol.dot(lhs, rhs, transpose_a=trans_csr) + test = mx.symbol.elemwise_add(sym_dot, zeros) + location = {'lhs':csr, 'rhs':dns2, 'zero':mx.nd.zeros(rsp_expected.shape)} + expected = {'rhs':rhs_backward_grad, 'zero':out_np} + # dot(lhs, rhs) + zeros + check_symbolic_forward(test, location, [rsp_expected.asnumpy()]) + check_symbolic_backward(test, location, [out_np], expected, + grad_req={'lhs': 'null', 'rhs': 'write', 'zero' : 'write'}) + + lhs_shape = (rnd.randint(1, 10),rnd.randint(1, 10)) + test_dot_csr_dns_rsp(lhs_shape, (lhs_shape[1], rnd.randint(1, 10)), 'row_sparse', False) + test_dot_csr_dns_rsp(lhs_shape, (lhs_shape[0], rnd.randint(1, 10)), 'row_sparse', True) + test_dot_csr_dns_rsp(lhs_shape, (lhs_shape[1], rnd.randint(1, 10)), 'default', False) + test_dot_csr_dns_rsp(lhs_shape, (lhs_shape[0], rnd.randint(1, 10)), 'default', True) + +''' +def test_sparse_embedding(): + in_dim = 10 + out_dim = 4 + batch = 24 + + data = mx.sym.Variable("data", dtype=np.int32) + embed = mx.sym.SparseEmbedding(data=data, input_dim=in_dim, output_dim=out_dim, name="embed") + # TODO(haibin) test again when simple_bind cpp api is ready + exe_test = embed.simple_bind(default_context(), grad_req={'data': 'null', 'embed_weight': 'write'}, + data=(batch,)) + arg_map = dict(zip(embed.list_arguments(), exe_test.arg_arrays)) + grad_map = dict(zip(embed.list_arguments(), exe_test.grad_arrays)) + np_data = np.random.randint(low=0, high=in_dim, size=batch) + np_weight = np.random.uniform(-0.01, 0.01, arg_map["embed_weight"].shape) + np_onehot = np.zeros((batch, in_dim)) + np_onehot[np.arange(batch), np_data] = 1.0 + # forward + arg_map["data"][:] = np_data + arg_map["embed_weight"][:] = np_weight + exe_test.forward(is_train=True) + assert_almost_equal(exe_test.outputs[0].asnumpy(), np.dot(np_onehot, np_weight)) + # backward + np_grad = np.random.uniform(-1, 1, exe_test.outputs[0].shape) + grad = mx.nd.zeros(np_grad.shape) + grad[:] = np_grad + exe_test.backward([grad]) + assert_almost_equal(grad_map["embed_weight"].asnumpy(), np.dot(np_onehot.T, np_grad)) +''' if __name__ == '__main__': - test_elemwise_add_dense() - test_elemwise_add_dense_sparse() - test_elemwise_add_sparse_sparse() - test_elemwise_add_multiple_stages() - test_cast_storage() + test_elemwise_add_ex() + test_elemwise_add_ex_multiple_stages() + test_cast_storage_ex() test_sparse_dot() + #test_sparse_embedding() diff --git a/tests/python/unittest/test_symbol.py b/tests/python/unittest/test_symbol.py index 414c1a1ddfe2..ab25f48eeb52 100644 --- a/tests/python/unittest/test_symbol.py +++ b/tests/python/unittest/test_symbol.py @@ -233,7 +233,6 @@ def test_zero_prop2(): test_symbol_infer_shape_var() test_symbol_infer_shape() test_symbol_infer_type() - #TODO test infer storage type test_symbol_internal() test_symbol_basic() test_symbol_compose()
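
As a quick, minimal sketch of the front end these tests exercise (an assumption-laden illustration, not part of the patch: it presumes a build of this branch, since the mx.sparse_nd module, the storage_type keyword, and the cast_storage operator only exist with this change applied), the row_sparse round trip looks roughly like:

import mxnet as mx
import numpy as np

# One stored row (index 0) of a 3x2 row_sparse array; values are 2-D,
# with one row of values per stored index, as in the tests above.
val = mx.nd.array([[5, 10]])
idx = mx.nd.array([0], dtype=np.int32)
rsp = mx.sparse_nd.row_sparse(val, idx, (3, 2))

# Convert back to default (dense) storage two ways and compare.
dns = mx.nd.cast_storage(rsp, storage_type='default')
assert np.array_equal(dns.asnumpy(), rsp.to_dense().asnumpy())

The names mirror the calls added in tests/python/unittest/test_sparse_ndarray.py and test_sparse_operator.py; they are illustrative only.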