diff --git a/include/mxnet/base.h b/include/mxnet/base.h
index e3fbe002fdfc..7e5b6d06f0d8 100644
--- a/include/mxnet/base.h
+++ b/include/mxnet/base.h
@@ -33,6 +33,9 @@
 #define MXNET_USE_CUDNN MSHADOW_USE_CUDNN
 #endif
 
+/*! \brief Error message for using gpu when MXNET_USE_CUDA==0 */
+#define MXNET_GPU_NOT_ENABLED_ERROR "GPU is not enabled"
+
 /*! \brief namespace of mxnet */
 namespace mxnet {
 /*! \brief mxnet cpu */
@@ -50,7 +53,6 @@ typedef mshadow::TShape TShape;
 typedef mshadow::TBlob TBlob;
 }  // namespace mxnet
 
-
 //! \cond Doxygen_Suppress
 namespace dmlc {
 // Add a few patches to support TShape in dmlc/parameter.
diff --git a/include/mxnet/context.h b/include/mxnet/context.h
index e7e2e2b0e44b..c0a712bc8ec8 100644
--- a/include/mxnet/context.h
+++ b/include/mxnet/context.h
@@ -83,6 +83,15 @@ struct RunContext {
    * \brief the stream of the device, can be NULL or Stream<gpu>* in GPU mode
    */
   void *stream;
+  /*!
+   * \brief get mshadow stream from Context
+   * \return the mshadow stream
+   * \tparam xpu the device type of the stream
+   */
+  template<typename xpu>
+  inline mshadow::Stream<xpu>* get_stream() const {
+    return static_cast<mshadow::Stream<xpu>*>(stream);
+  }
 };
 
 /*!
diff --git a/include/mxnet/operator.h b/include/mxnet/operator.h
index 64c1515a6f3b..0842f53d347e 100644
--- a/include/mxnet/operator.h
+++ b/include/mxnet/operator.h
@@ -53,7 +53,7 @@ struct OpContext {
    */
   template<typename xpu>
   inline mshadow::Stream<xpu>* get_stream() const {
-    return static_cast<mshadow::Stream<xpu>*>(run_ctx.stream);
+    return run_ctx.get_stream<xpu>();
   }
 };
 
diff --git a/src/kvstore/kvstore_local.h b/src/kvstore/kvstore_local.h
index 34c206bfc6de..bea6cb019356 100644
--- a/src/kvstore/kvstore_local.h
+++ b/src/kvstore/kvstore_local.h
@@ -19,7 +19,12 @@ namespace mxnet {
  */
 class KVStoreLocal : public KVStore {
  public:
-  KVStoreLocal() : pinned_ctx_(cpu::kDevMask, Context::kPinnedMemoryID) {
+  KVStoreLocal() {
+#if MXNET_USE_CUDA
+    pinned_ctx_ = Context(cpu::kDevMask, Context::kPinnedMemoryID);
+#else
+    pinned_ctx_ = Context(cpu::kDevMask, 0);
+#endif
     Clear();
   }
 
@@ -44,11 +49,7 @@ class KVStoreLocal : public KVStore {
     for (size_t i = 0; i < keys.size(); ++i) {
       CHECK(local_.find(keys[i]) == local_.end())
           << "duplicate init of key " << keys[i];
-#if MXNET_USE_CUDA
       local_.insert({keys[i], values[i].Copy(pinned_ctx_)});
-#else
-      local_.insert({keys[i], values[i].Copy(local_ctx_)});
-#endif  // MXNET_USE_CUDA
     }
   }
 
@@ -121,7 +122,7 @@ class KVStoreLocal : public KVStore {
     CHECK(val.size());
     auto& buf = merge_buf_[key];
     if (buf.merged.is_none()) {
-      buf.merged = val[0].Copy(local_ctx_);
+      buf.merged = val[0].Copy(pinned_ctx_);
     } else {
       CopyFromTo(val[0], &buf.merged);
     }
@@ -167,7 +168,6 @@ class KVStoreLocal : public KVStore {
 
   /// \brief local storage
   std::unordered_map<int, NArray> local_;
-  Context local_ctx_;
   Context pinned_ctx_;
 
   Updater updater_;
diff --git a/src/narray/narray.cc b/src/narray/narray.cc
index 7bce1d5c5243..22097eb46e86 100644
--- a/src/narray/narray.cc
+++ b/src/narray/narray.cc
@@ -5,6 +5,7 @@
  */
 #include 
 #include 
+#include 
 #include 
 #include 
 #include "./narray_function.h"
@@ -42,45 +43,34 @@ inline void BinaryOp(const NArray &lhs,
   }
   // important: callback must always capture by value
   NArray ret = *out;
+  // get the const variables
+  std::vector<Engine::VarHandle> const_vars;
+  if (lhs.ptr_->var != ret.ptr_->var) const_vars.push_back(lhs.ptr_->var);
+  if (rhs.ptr_->var != ret.ptr_->var) const_vars.push_back(rhs.ptr_->var);
+
   // redirect everything to mshadow operations
   switch (lhs.ctx().dev_mask) {
     case cpu::kDevMask: {
-      auto func = [lhs, rhs, ret](RunContext ctx) {
-          ret.ptr_->CheckAndAlloc();
-          TBlob tmp = ret.data();
-          narray::Eval<cpu, OP>(lhs.data(), rhs.data(), &tmp, ctx);
-        };
-      if (lhs.ptr_->var == ret.ptr_->var && rhs.ptr_->var == ret.ptr_->var) {
-        Engine::Get()->Push(func, lhs.ctx(), {}, {ret.ptr_->var});
-      } else if (lhs.ptr_->var == ret.ptr_->var) {
-        Engine::Get()->Push(func, lhs.ctx(), {rhs.ptr_->var}, {ret.ptr_->var});
-      } else if (rhs.ptr_->var == ret.ptr_->var) {
-        Engine::Get()->Push(func, lhs.ctx(), {lhs.ptr_->var}, {ret.ptr_->var});
-      } else {
-        Engine::Get()->Push(func, lhs.ctx(), {lhs.ptr_->var, rhs.ptr_->var}, {ret.ptr_->var});
-      }
+      Engine::Get()->Push([lhs, rhs, ret](RunContext ctx) {
+          ret.ptr_->CheckAndAlloc();
+          TBlob tmp = ret.data();
+          narray::Eval<cpu, OP>(lhs.data(), rhs.data(), &tmp, ctx);
+        }, lhs.ctx(), const_vars, {ret.ptr_->var});
       break;
     }
 #if MXNET_USE_CUDA
     case gpu::kDevMask: {
-      auto func = [lhs, rhs, ret](RunContext ctx) {
-          ret.ptr_->CheckAndAlloc();
-          TBlob tmp = ret.data();
-          narray::Eval<gpu, OP>(lhs.data(), rhs.data(), &tmp, ctx);
-        };
-      if (lhs.ptr_->var == ret.ptr_->var && rhs.ptr_->var == ret.ptr_->var) {
-        Engine::Get()->Push(func, lhs.ctx(), {}, {ret.ptr_->var});
-      } else if (lhs.ptr_->var == ret.ptr_->var) {
-        Engine::Get()->Push(func, lhs.ctx(), {rhs.ptr_->var}, {ret.ptr_->var});
-      } else if (rhs.ptr_->var == ret.ptr_->var) {
-        Engine::Get()->Push(func, lhs.ctx(), {lhs.ptr_->var}, {ret.ptr_->var});
-      } else {
-        Engine::Get()->Push(func, lhs.ctx(), {lhs.ptr_->var, rhs.ptr_->var}, {ret.ptr_->var});
-      }
+      Engine::Get()->Push([lhs, rhs, ret](RunContext ctx) {
+          ret.ptr_->CheckAndAlloc();
+          TBlob tmp = ret.data();
+          narray::Eval<gpu, OP>(lhs.data(), rhs.data(), &tmp, ctx);
+          // Wait GPU kernel to complete
+          ctx.get_stream<gpu>()->Wait();
+        }, lhs.ctx(), const_vars, {ret.ptr_->var});
       break;
     }
 #endif
-    default: LOG(FATAL) << "GPU is not enabled";
+    default: LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
   }
 }
 
@@ -90,26 +80,26 @@ inline void SetValueOp(const real_t &rhs, NArray *out) {
   NArray ret = *out;
   switch (ret.ctx().dev_mask) {
     case cpu::kDevMask: {
-      auto func = [rhs, ret](RunContext ctx) {
-          ret.ptr_->CheckAndAlloc();
-          TBlob tmp = ret.data();
-          narray::Eval<cpu>(rhs, &tmp, ctx);
-        };
-      Engine::Get()->Push(func, ret.ctx(), {}, {ret.ptr_->var});
+      Engine::Get()->Push([rhs, ret](RunContext ctx) {
+          ret.ptr_->CheckAndAlloc();
+          TBlob tmp = ret.data();
+          narray::Eval<cpu>(rhs, &tmp, ctx);
+        }, ret.ctx(), {}, {ret.ptr_->var});
       break;
     }
 #if MXNET_USE_CUDA
    case gpu::kDevMask: {
-      auto func = [rhs, ret](RunContext ctx) {
-          ret.ptr_->CheckAndAlloc();
-          TBlob tmp = ret.data();
-          narray::Eval<gpu>(rhs, &tmp, ctx);
-        };
-      Engine::Get()->Push(func, ret.ctx(), {}, {ret.ptr_->var});
+      Engine::Get()->Push([rhs, ret](RunContext ctx) {
+          ret.ptr_->CheckAndAlloc();
+          TBlob tmp = ret.data();
+          narray::Eval<gpu>(rhs, &tmp, ctx);
+          // Wait GPU kernel to complete
+          ctx.get_stream<gpu>()->Wait();
+        }, ret.ctx(), {}, {ret.ptr_->var});
       break;
     }
 #endif
-    default: LOG(FATAL) << "GPU is not enabled";
+    default: LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
   }
 }
 /*!
@@ -124,45 +114,40 @@ inline void ScalarOp(const NArray &lhs,
                      const real_t &rhs,
                      NArray *out) {
   if (out->is_none()) {
-    *out = NArray(OP::GetShape(lhs.shape(), lhs.shape()), lhs.ctx(), true);
+    *out = NArray(lhs.shape(), lhs.ctx(), true);
   } else {
     CHECK(out->ctx() == lhs.ctx()) << "target context mismatch";
-    CHECK(out->shape() == OP::GetShape(lhs.shape(), lhs.shape()))
-        << "target shape mismatch";
+    CHECK(out->shape() == lhs.shape()) << "target shape mismatch";
   }
   // important: callback must always capture by value
   NArray ret = *out;
+  // get the const variables
+  std::vector<Engine::VarHandle> const_vars;
+  if (lhs.ptr_->var != ret.ptr_->var) const_vars.push_back(lhs.ptr_->var);
+
   // redirect everything to mshadow operations
   switch (lhs.ctx().dev_mask) {
     case cpu::kDevMask: {
-      auto func = [lhs, rhs, ret](RunContext ctx) {
-          ret.ptr_->CheckAndAlloc();
-          TBlob tmp = ret.data();
-          narray::Eval<cpu, OP>(lhs.data(), rhs, &tmp, ctx);
-        };
-      if (lhs.ptr_->var == ret.ptr_->var) {
-        Engine::Get()->Push(func, lhs.ctx(), {}, {ret.ptr_->var});
-      } else {
-        Engine::Get()->Push(func, lhs.ctx(), {lhs.ptr_->var}, {ret.ptr_->var});
-      }
+      Engine::Get()->Push([lhs, rhs, ret](RunContext ctx) {
+          ret.ptr_->CheckAndAlloc();
+          TBlob tmp = ret.data();
+          narray::Eval<cpu, OP>(lhs.data(), rhs, &tmp, ctx);
+        }, lhs.ctx(), const_vars, {ret.ptr_->var});
       break;
     }
 #if MXNET_USE_CUDA
     case gpu::kDevMask: {
-      auto func = [lhs, rhs, ret](RunContext ctx) {
-          ret.ptr_->CheckAndAlloc();
-          TBlob tmp = ret.data();
-          narray::Eval<gpu, OP>(lhs.data(), rhs, &tmp, ctx);
-        };
-      if (lhs.ptr_->var == ret.ptr_->var) {
-        Engine::Get()->Push(func, lhs.ctx(), {}, {ret.ptr_->var});
-      } else {
-        Engine::Get()->Push(func, lhs.ctx(), {lhs.ptr_->var}, {ret.ptr_->var});
-      }
+      Engine::Get()->Push([lhs, rhs, ret](RunContext ctx) {
+          ret.ptr_->CheckAndAlloc();
+          TBlob tmp = ret.data();
+          narray::Eval<gpu, OP>(lhs.data(), rhs, &tmp, ctx);
+          // Wait GPU kernel to complete
+          ctx.get_stream<gpu>()->Wait();
+        }, lhs.ctx(), const_vars, {ret.ptr_->var});
       break;
     }
 #endif
-    default: LOG(FATAL) << "GPU is not enabled";
+    default: LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
   }
 }
 
@@ -175,48 +160,52 @@ void CopyFromTo(const NArray &from, NArray *to) {
   NArray ret = *to;
   int a = from.ctx().dev_mask;
   int b = to->ctx().dev_mask;
+
+  std::vector<Engine::VarHandle> const_vars;
+  if (from.ptr_->var != ret.ptr_->var) const_vars.push_back(from.ptr_->var);
+
   if (a == cpu::kDevMask && b == cpu::kDevMask) {
     Engine::Get()->Push([from, ret](RunContext ctx) {
         ret.ptr_->CheckAndAlloc();
         TBlob tmp = ret.data();
         narray::Copy<cpu, cpu>(from.data(), &tmp,
                                from.ctx(), ret.ctx(), ctx);
-      }, from.ctx(), {from.ptr_->var}, {ret.ptr_->var});
-  } else if (a == cpu::kDevMask && b == gpu::kDevMask) {
-#if MXNET_USE_CUDA
-    Engine::Get()->Push([from, ret](RunContext ctx) {
-        ret.ptr_->CheckAndAlloc();
-        TBlob tmp = ret.data();
-        narray::Copy<cpu, gpu>(from.data(), &tmp,
-                               from.ctx(), ret.ctx(), ctx);
-      }, ret.ctx(), {from.ptr_->var}, {ret.ptr_->var});
-#else
-    LOG(FATAL) << "GPU is not enabled";
-#endif
-  } else if (a == gpu::kDevMask && b == cpu::kDevMask) {
-#if MXNET_USE_CUDA
-    Engine::Get()->Push([from, ret](RunContext ctx) {
-        ret.ptr_->CheckAndAlloc();
-        TBlob tmp = ret.data();
-        narray::Copy<gpu, cpu>(from.data(), &tmp,
-                               from.ctx(), ret.ctx(), ctx);
-      }, from.ctx(), {from.ptr_->var}, {ret.ptr_->var});
-#else
-    LOG(FATAL) << "GPU is not enabled";
-#endif
-  } else if (a == gpu::kDevMask && b == gpu::kDevMask) {
+      }, from.ctx(), const_vars, {ret.ptr_->var});
+  } else {
 #if MXNET_USE_CUDA
-    Engine::Get()->Push([from, ret](RunContext ctx) {
-        ret.ptr_->CheckAndAlloc();
-        TBlob tmp = ret.data();
-        narray::Copy<gpu, gpu>(from.data(), &tmp,
-                               from.ctx(), ret.ctx(), ctx);
-      }, from.ctx(), {from.ptr_->var}, {ret.ptr_->var});
+    if (a == cpu::kDevMask && b == gpu::kDevMask) {
+      Engine::Get()->Push([from, ret](RunContext ctx) {
+          ret.ptr_->CheckAndAlloc();
+          TBlob tmp = ret.data();
+          narray::Copy<cpu, gpu>(from.data(), &tmp,
+                                 from.ctx(), ret.ctx(), ctx);
+          // Wait GPU kernel to complete
+          ctx.get_stream<gpu>()->Wait();
+        }, ret.ctx(), const_vars, {ret.ptr_->var});
+    } else if (a == gpu::kDevMask && b == cpu::kDevMask) {
+      Engine::Get()->Push([from, ret](RunContext ctx) {
+          ret.ptr_->CheckAndAlloc();
+          TBlob tmp = ret.data();
+          narray::Copy<gpu, cpu>(from.data(), &tmp,
+                                 from.ctx(), ret.ctx(), ctx);
+          // Wait GPU kernel to complete
+          ctx.get_stream<gpu>()->Wait();
+        }, from.ctx(), const_vars, {ret.ptr_->var});
+    } else if (a == gpu::kDevMask && b == gpu::kDevMask) {
+      Engine::Get()->Push([from, ret](RunContext ctx) {
+          ret.ptr_->CheckAndAlloc();
+          TBlob tmp = ret.data();
+          narray::Copy<gpu, gpu>(from.data(), &tmp,
+                                 from.ctx(), ret.ctx(), ctx);
+          // Wait GPU kernel to complete
+          ctx.get_stream<gpu>()->Wait();
+        }, from.ctx(), const_vars, {ret.ptr_->var});
+    } else {
+      LOG(FATAL) << "unknown device mask";
+    }
 #else
-    LOG(FATAL) << "GPU is not enabled";
+    LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
 #endif
-  } else {
-    LOG(FATAL) << "unknown device mask";
   }
 }
 
diff --git a/src/symbol/graph_executor.cc b/src/symbol/graph_executor.cc
index 863355c28937..1211f1a4abb4 100644
--- a/src/symbol/graph_executor.cc
+++ b/src/symbol/graph_executor.cc
@@ -169,7 +169,6 @@ inline std::vector > GraphExecutor::GetInplaceOption(
 inline GraphExecutor::OpExecEntry
 GraphExecutor::GetOpExecEntry(uint32_t nid) {
   OpNode& op_node = op_nodes_[nid];
-  Operator *op = op_node.op.get();
   std::vector<OpReqType> req;
   std::vector<TBlob> in_data, out_data, aux_states;
   in_data.reserve(graph_.nodes[nid].inputs.size());
@@ -199,14 +198,30 @@ GraphExecutor::GetOpExecEntry(uint32_t nid) {
     }
   }
 
+  // start setup exec function.
+  Operator* op = op_node.op.get();
   OpContext* op_ctx_ptr = &op_node.op_ctx;
-  exec.exec_fun = [op, op_ctx_ptr, in_data, req, out_data, aux_states] (RunContext ctx) {
+  bool is_gpu = op_node.ctx.dev_mask == gpu::kDevMask;
+  exec.exec_fun = [op, is_gpu, op_ctx_ptr, in_data, req, out_data, aux_states] (RunContext ctx) {
     op_ctx_ptr->run_ctx = ctx;
     op->Forward(*op_ctx_ptr, in_data, req, out_data, aux_states);
+    if (is_gpu) {
+#if MXNET_USE_CUDA
+      // Wait GPU kernel to finish.
+      ctx.get_stream<gpu>()->Wait();
+#else
+      LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR;
+#endif
+    }
   };
   return exec;
 }
 
+GraphExecutor::~GraphExecutor() {
+  // need to destruct after all previously issued operations are finished.
+  Engine::Get()->WaitForAll();
+}
+
 void GraphExecutor::InitGraph(Symbol symbol, Context ctx, bool need_backward) {
   // initialize all internal data structures
   symbol.ToStaticGraph(&graph_);
diff --git a/src/symbol/graph_executor.h b/src/symbol/graph_executor.h
index 074fafa0c571..5db11fcdf779 100644
--- a/src/symbol/graph_executor.h
+++ b/src/symbol/graph_executor.h
@@ -19,7 +19,7 @@ namespace mxnet {
  */
 class GraphExecutor : public Executor {
  public:
-  virtual ~GraphExecutor() {}
+  virtual ~GraphExecutor();
   virtual void Forward(bool is_train);
   virtual void Backward(const std::vector<NArray> &head_grads);
   virtual const std::vector<NArray> &heads() const {
diff --git a/src/symbol/graph_memory_allocator.h b/src/symbol/graph_memory_allocator.h
index b7bd2db2081e..9c995cd29993 100644
--- a/src/symbol/graph_memory_allocator.h
+++ b/src/symbol/graph_memory_allocator.h
@@ -56,7 +56,7 @@ class GraphStorageAllocator {
    */
   NArray Get(StorageID id, TShape shape);
 
- private:
+ protected:
   /*! \brief internal storage entry */
   struct StorageEntry {
     /*! \brief id of the storage */