Skip to content
This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

[mkldnn-v1.0] Add MKL-DNN Convolution #16141

Merged
merged 6 commits into from
Sep 18, 2019
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions include/mxnet/ndarray.h
Original file line number Diff line number Diff line change
Expand Up @@ -761,8 +761,8 @@ class NDArray {
* It changes the layout of this NDArray, but it happens after all accesses to
* the array are complete.
*/
void Reorder2DefaultAsync();
void MKLDNNDataReorderAsync(const mkldnn::memory::desc &md);
void Reorder2DefaultAsync() const;
void MKLDNNDataReorderAsync(const mkldnn::memory::desc &md) const;

/*
* This creates a new NDArray with the reordered data.
Expand Down
8 changes: 4 additions & 4 deletions src/common/exec_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,15 +59,15 @@ inline bool SetupDefaultBlobsIn(const std::vector<NDArray>& src,
for (size_t i = 0; i < src.size(); i++) {
auto& nd = src[i];
bool is_default = nd.storage_type() == kDefaultStorage;
#if MXNET_USE_MKLDNN == 1
#if MXNET_USE_MKLDNN == 100
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we upgrade mkldnn to version 1.1, shall we have to change this to something like 110? I prefer not encoding the version into this macro.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No. Here we simply use 100 and 1 to distinguish the code for MKL-DNN v0.x and v1.x so we don't need to remove v0.x integration code and hence convenient to review. When merging this branch back to master, we will change all MXNET_USE_MKLDNN == 100 back to MXNET_USE_MKLDNN == 1.

// We have to make sure it's default storage and default layout.
is_default = nd.IsDefaultData();
#endif
if (!is_default) {
(*idx_map)[i] = temp_dst->size();
NDArray temp = bufs != nullptr ? bufs->at(i) : NDArray(nd.shape(), nd.ctx(),
true, nd.dtype());
#if MXNET_USE_MKLDNN == 1
#if MXNET_USE_MKLDNN == 100
CHECK(temp.IsDefaultData());
#endif
temp_src->emplace_back(nd);
Expand All @@ -91,7 +91,7 @@ inline bool SetupDefaultBlobsOut(const std::vector<NDArray>& src,
for (size_t i = 0; i < src.size(); i++) {
auto& nd = src[i];
bool is_default = nd.storage_type() == kDefaultStorage;
#if MXNET_USE_MKLDNN == 1
#if MXNET_USE_MKLDNN == 100
if (req->at(i) == kWriteInplace && nd.IsMKLDNNData())
// If it's write inplace and the output array doesn't use the default
// layout, we'll generate a temporary output array below, which means
Expand All @@ -102,7 +102,7 @@ inline bool SetupDefaultBlobsOut(const std::vector<NDArray>& src,
is_default = nd.IsDefaultData();
#endif
if (!is_default) {
#if MXNET_USE_MKLDNN == 1
#if MXNET_USE_MKLDNN == 100
NDArray temp;
if (bufs != nullptr) {
temp = bufs->at(i);
Expand Down
8 changes: 4 additions & 4 deletions src/executor/attach_op_execs_pass.cc
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ class StatefulComputeExecutor : public StorageFallbackOpExecutor {
public:
void Run(RunContext rctx, bool is_gpu) override {
op_ctx.run_ctx = rctx;
#if MXNET_USE_MKLDNN == 1
#if MXNET_USE_MKLDNN == 100
InvalidateOutputs(out_array, req);
#endif
PreFCompute(is_gpu);
Expand Down Expand Up @@ -155,7 +155,7 @@ class StatefulComputeExExecutor : public OpExecutor {
public:
void Run(RunContext rctx, bool is_gpu) override {
op_ctx.run_ctx = rctx;
#if MXNET_USE_MKLDNN == 1
#if MXNET_USE_MKLDNN == 100
InvalidateOutputs(out_array, req);
// TODO(alex): (MXNET-847) Remove this fallback feature after subgraph implemented
const auto is_mkldnn = Op::GetAttr<bool>("TIsMKLDNN");
Expand Down Expand Up @@ -202,7 +202,7 @@ class FComputeExecutor : public StorageFallbackOpExecutor {
void Run(RunContext rctx, bool is_gpu) override {
using namespace common;
op_ctx.run_ctx = rctx;
#if MXNET_USE_MKLDNN == 1
#if MXNET_USE_MKLDNN == 100
InvalidateOutputs(out_array, req);
#endif
PreFCompute(is_gpu);
Expand Down Expand Up @@ -231,7 +231,7 @@ class FComputeExExecutor : public OpExecutor {
public:
void Run(RunContext rctx, bool is_gpu) override {
op_ctx.run_ctx = rctx;
#if MXNET_USE_MKLDNN == 1
#if MXNET_USE_MKLDNN == 100
InvalidateOutputs(out_array, req);
// TODO(alex): (MXNET-847) Remove this fallback feature after subgraph implemented
const auto is_mkldnn = Op::GetAttr<bool>("TIsMKLDNN");
Expand Down
20 changes: 15 additions & 5 deletions src/imperative/imperative_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -418,7 +418,7 @@ inline void PushFCompute(const FCompute& fn,
std::vector<NDArray> pre_temp_src, pre_temp_dst, post_temp_dst, post_temp_src;
// mapping from index in input_blobs to index in pre_temp_dst
std::unordered_map<uint32_t, uint32_t> in_temp_idx_map;
#if MXNET_USE_MKLDNN == 1
#if MXNET_USE_MKLDNN == 100
if (exec_type != ExecType::kCrossDeviceCopy) {
// kCrossDeviceCopy is used for `_copy_to` operator, which doesn't compute immediately in
// its FComputeEx, but AsyncPush the copy operation to engine.
Expand Down Expand Up @@ -467,7 +467,7 @@ inline void PushFComputeEx(const FComputeEx& fn,
DerefInputOutput(p_inputs, p_outputs, &inputs, &outputs);
const auto& run = [=](RunContext rctx) {
OpContext opctx{need_grad, is_train, rctx, engine::CallbackOnComplete(), requested};
#if MXNET_USE_MKLDNN == 1
#if MXNET_USE_MKLDNN == 100
if (exec_type != ExecType::kCrossDeviceCopy) {
// kCrossDeviceCopy is used for `_copy_to` operator, which doesn't compute immediately in
// its FComputeEx, but AsyncPush the copy operation to engine.
Expand All @@ -476,8 +476,18 @@ inline void PushFComputeEx(const FComputeEx& fn,
// copying A to B may not happen, and will corrupt A's memory.
InvalidateOutputs(outputs, req);
}
// add for mkldnn OP + no mkldnn OP
const auto is_mkldnn = Op::GetAttr<bool>("TIsMKLDNN");
if (!is_mkldnn.get(attrs.op, false)) {
std::vector<NDArray> inputs_fallback;
CreateDefaultInputs(inputs, &inputs_fallback);
fn(attrs, opctx, inputs_fallback, req, outputs);
} else {
#endif
fn(attrs, opctx, inputs, req, outputs);
#if MXNET_USE_MKLDNN == 100
}
#endif
fn(attrs, opctx, inputs, req, outputs);
if (ctx.dev_mask() == gpu::kDevMask && exec_type == ExecType::kSync && !rctx.is_bulk) {
rctx.get_stream<gpu>()->Wait();
}
Expand Down Expand Up @@ -521,7 +531,7 @@ inline void PushOperator(const OpStatePtr& state,
const auto& run = [=](RunContext rctx,
engine::CallbackOnComplete on_complete) {
OpContext opctx{need_grad, is_train, rctx, on_complete, requested};
#if MXNET_USE_MKLDNN == 1
#if MXNET_USE_MKLDNN == 100
if (exec_type != ExecType::kCrossDeviceCopy) {
// kCrossDeviceCopy is used for `_copy_to` operator, which doesn't compute immediately in
// its FComputeEx, but AsyncPush the copy operation to engine.
Expand Down Expand Up @@ -567,7 +577,7 @@ inline void PushOperator(const OpStatePtr& state,
std::vector<NDArray> pre_temp_src, pre_temp_dst, post_temp_dst, post_temp_src;
// mapping from index in input_blobs to index in pre_temp_dst
std::unordered_map<uint32_t, uint32_t> in_temp_idx_map;
#if MXNET_USE_MKLDNN == 1
#if MXNET_USE_MKLDNN == 100
if (exec_type != ExecType::kCrossDeviceCopy) {
// kCrossDeviceCopy is used for `_copy_to` operator, which doesn't compute immediately in
// its FCcomputeEx, but AsyncPush the copy operation to engine.
Expand Down
4 changes: 2 additions & 2 deletions src/ndarray/ndarray.cc
Original file line number Diff line number Diff line change
Expand Up @@ -592,7 +592,7 @@ NDArray NDArray::Reorder2Default() const {
return ret;
}

void NDArray::Reorder2DefaultAsync() {
void NDArray::Reorder2DefaultAsync() const {
std::vector<Engine::VarHandle> const_vars;
std::vector<Engine::VarHandle> mutable_vars(1, this->var());
NDArray tmp = *this;
Expand All @@ -604,7 +604,7 @@ void NDArray::Reorder2DefaultAsync() {
FnProperty::kNormal, 0, "Reorder2Default");
}

void NDArray::MKLDNNDataReorderAsync(const mkldnn::memory::desc &desc) {
void NDArray::MKLDNNDataReorderAsync(const mkldnn::memory::desc &desc) const {
std::vector<Engine::VarHandle> const_vars;
std::vector<Engine::VarHandle> mutable_vars(1, this->var());
NDArray tmp = *this;
Expand Down
20 changes: 11 additions & 9 deletions src/operator/nn/convolution.cc
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
#if MXNET_USE_NNPACK == 1
#include "../nnpack/nnpack_pooling-inl.h"
#endif // MXNET_USE_NNPACK
#if MXNET_USE_MKLDNN == 1
#if MXNET_USE_MKLDNN == 100
#include "./mkldnn/mkldnn_base-inl.h"
#include "./mkldnn/mkldnn_ops-inl.h"
#endif // MXNET_USE_MKLDNN
Expand All @@ -51,7 +51,7 @@ static inline std::vector<std::string> ListArguments(const ConvolutionParam& par
}
}

#if MXNET_USE_MKLDNN == 1
#if MXNET_USE_MKLDNN == 100
static void ConvolutionComputeExCPU(const nnvm::NodeAttrs& attrs,
const OpContext& ctx,
const std::vector<NDArray>& inputs,
Expand All @@ -60,7 +60,8 @@ static void ConvolutionComputeExCPU(const nnvm::NodeAttrs& attrs,
const ConvolutionParam& params = nnvm::get<ConvolutionParam>(attrs.parsed);
if (SupportMKLDNNConv(params, inputs[0])) {
MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs);
MKLDNNConvolutionForward(attrs, ctx, inputs, req, outputs);
const auto mkldnn_inputs = GetMKLDNNInputArray(inputs);
MKLDNNConvolutionForward(attrs, ctx, mkldnn_inputs, req, outputs);
MKLDNN_OPCHECK_RUN(ConvolutionCompute<cpu>, attrs, ctx, inputs, req, outputs);
return;
}
Expand All @@ -75,7 +76,8 @@ static void ConvolutionGradComputeExCPU(const nnvm::NodeAttrs& attrs,
const ConvolutionParam& params = nnvm::get<ConvolutionParam>(attrs.parsed);
if (SupportMKLDNNConv(params, inputs[0])) {
MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs);
MKLDNNConvolutionBackward(attrs, ctx, inputs, req, outputs);
const auto mkldnn_inputs = GetMKLDNNInputArray(inputs);
MKLDNNConvolutionBackward(attrs, ctx, mkldnn_inputs, req, outputs);
MKLDNN_OPCHECK_RUN(ConvolutionGradCompute<cpu>, attrs, ctx, inputs, req, outputs);
return;
}
Expand Down Expand Up @@ -302,7 +304,7 @@ static bool ConvolutionType(const nnvm::NodeAttrs& attrs,
return true;
}

#if MXNET_USE_MKLDNN == 1
#if MXNET_USE_MKLDNN == 100
inline static bool ConvStorageType(const nnvm::NodeAttrs& attrs,
const int dev_mask,
DispatchMode* dispatch_mode,
Expand Down Expand Up @@ -491,11 +493,11 @@ There are other options to tune the performance.
})
.set_attr<mxnet::FInferShape>("FInferShape", ConvolutionShape)
.set_attr<nnvm::FInferType>("FInferType", ConvolutionType)
#if MXNET_USE_MKLDNN == 1
#if MXNET_USE_MKLDNN == 100
.set_attr<FInferStorageType>("FInferStorageType", ConvStorageType)
#endif
.set_attr<FCompute>("FCompute<cpu>", ConvolutionCompute<cpu>)
#if MXNET_USE_MKLDNN == 1
#if MXNET_USE_MKLDNN == 100
.set_attr<bool>("TIsMKLDNN", true)
.set_attr<FComputeEx>("FComputeEx<cpu>", ConvolutionComputeExCPU)
#endif
Expand All @@ -514,14 +516,14 @@ NNVM_REGISTER_OP(_backward_Convolution)
return params.no_bias ? 2 : 3;
})
.set_attr<nnvm::TIsBackward>("TIsBackward", true)
#if MXNET_USE_MKLDNN == 1
#if MXNET_USE_MKLDNN == 100
.set_attr<FInferStorageType>("FInferStorageType", BackwardConvStorageType)
#endif
.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
})
.set_attr_parser(ConvolutionParamParser)
#if MXNET_USE_MKLDNN == 1
#if MXNET_USE_MKLDNN == 100
.set_attr<bool>("TIsMKLDNN", true)
.set_attr<FComputeEx>("FComputeEx<cpu>", ConvolutionGradComputeExCPU)
#endif
Expand Down
13 changes: 13 additions & 0 deletions src/operator/nn/mkldnn/mkldnn_base-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,19 @@ inline static mkldnn::memory::desc GetWeightDesc(const NDArray &arr,
}
}

/*
 * Returns a copy of `inputs` in which every NDArray that is both a view and
 * carries MKL-DNN-specific layout data has been reordered to the default
 * layout via Reorder2Default(); all other entries are copied through
 * unchanged.
 *
 * NOTE(review): presumably MKL-DNN primitives cannot consume a *view* of an
 * array stored in an internal MKL-DNN format, hence the up-front reorder —
 * confirm against the MKL-DNN operator implementations.
 *
 * \param inputs  operator input arrays, possibly views / MKL-DNN formatted
 * \return        input arrays safe to hand to an MKL-DNN primitive
 */
inline static std::vector<NDArray> GetMKLDNNInputArray(const std::vector<NDArray> &inputs) {
  // Return type is intentionally non-const: a const-qualified return value
  // would block move semantics at the call site.
  std::vector<NDArray> ret;
  ret.reserve(inputs.size());
  for (const auto &in : inputs) {
    if (in.IsView() && in.IsMKLDNNData()) {
      // Materialize the view in the default layout before use.
      ret.push_back(in.Reorder2Default());
    } else {
      ret.push_back(in);
    }
  }
  return ret;
}

typedef std::shared_ptr<mkldnn::memory> mkldnn_mem_ptr;
typedef std::shared_ptr<const mkldnn::memory> mkldnn_mem_const_ptr;

Expand Down
2 changes: 0 additions & 2 deletions src/operator/nn/mkldnn/mkldnn_base.cc
Original file line number Diff line number Diff line change
Expand Up @@ -530,13 +530,11 @@ bool MKLDNNStorageType(const nnvm::NodeAttrs &attrs,
if (v == - 1) v = kDefaultStorage;

DispatchMode wanted_mode;
#if MXNET_USE_MKLDNN == 1
TaoLv marked this conversation as resolved.
Show resolved Hide resolved
if (dev_mask == mshadow::cpu::kDevMask && !MKLDNNEnvSet())
wanted_mode = DispatchMode::kFComputeFallback;
else if (dev_mask == mshadow::cpu::kDevMask && support_mkldnn)
wanted_mode = DispatchMode::kFComputeEx;
else
#endif
wanted_mode = DispatchMode::kFCompute;

bool dispatched = false;
Expand Down
Loading